[llvm] [AMDGPU] Mitigate GFX12 VALU read SGPR hazard (PR #100067)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 24 00:06:45 PDT 2024
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/100067
>From ec5fda9e5fe818c716a6bd026424b803f1837f6c Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 22 Jul 2024 12:39:38 +0900
Subject: [PATCH 1/3] [AMDGPU] Mitigate GFX12 VALU read SGPR hazard
Any SGPR read by a VALU can potentially obscure SALU writes to the
same register. Insert s_wait_alu instructions to mitigate the
hazard on affected paths.
Compute a global cache of SGPRs with any VALU reads and use this to
avoid inserting mitigation for SGPRs never accessed by VALUs.
To avoid an excessive search when compile time is the priority, implement
a secondary mode in which all SALU writes are mitigated.
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 268 +++++-
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 3 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +
.../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 52 +-
.../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 52 +-
.../GlobalISel/clamp-fmed3-const-combine.ll | 8 +
.../GlobalISel/clamp-minmax-const-combine.ll | 17 +
.../GlobalISel/extractelement-stack-lower.ll | 3 +
.../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 20 +-
.../GlobalISel/fmed3-min-max-const-combine.ll | 18 +
.../llvm.amdgcn.global.atomic.csub.ll | 4 +
.../GlobalISel/llvm.amdgcn.rsq.clamp.ll | 28 +-
.../AMDGPU/GlobalISel/load-constant.96.ll | 10 +
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 9 +
llvm/test/CodeGen/AMDGPU/abs_i16.ll | 8 +
.../AMDGPU/atomic_optimizations_buffer.ll | 65 +-
.../atomic_optimizations_global_pointer.ll | 102 ++-
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 54 +-
.../atomic_optimizations_struct_buffer.ll | 54 +-
llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 10 +-
.../buffer-fat-pointer-atomicrmw-fadd.ll | 125 ++-
.../buffer-fat-pointer-atomicrmw-fmax.ll | 164 +++-
.../buffer-fat-pointer-atomicrmw-fmin.ll | 164 +++-
.../test/CodeGen/AMDGPU/code-size-estimate.ll | 36 +-
.../fast-unaligned-load-store.global.ll | 3 +
.../fast-unaligned-load-store.private.ll | 12 +
llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 7 +
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 179 +++-
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 206 ++++-
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 206 ++++-
.../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 232 +++--
llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 46 +-
llvm/test/CodeGen/AMDGPU/fmaximum.ll | 3 +-
llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 86 +-
llvm/test/CodeGen/AMDGPU/fminimum.ll | 3 +-
llvm/test/CodeGen/AMDGPU/fminimum3.ll | 86 +-
.../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 14 +
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 16 +-
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 150 +++-
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 196 +++-
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 196 +++-
.../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 232 +++--
.../test/CodeGen/AMDGPU/global_atomics_i64.ll | 48 +-
.../hazard-recognizer-src-shared-base.ll | 23 +
.../AMDGPU/indirect-call-known-callees.ll | 17 +-
.../insert_waitcnt_for_precise_memory.ll | 48 +-
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 57 ++
.../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 12 +
.../llvm.amdgcn.buffer.load-last-use.ll | 2 +
.../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll | 28 +
.../CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll | 20 +
.../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 64 +-
.../AMDGPU/llvm.amdgcn.permlane.ptr.ll | 16 +
...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 14 +-
...amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll | 5 +
...m.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll | 5 +
.../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 26 +-
.../CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll | 2 +
...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 22 +-
...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 24 +-
...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 24 +-
...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 24 +-
...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 24 +-
.../CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll | 3 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 25 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 22 +
llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 22 +
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 25 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 22 +
llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 22 +
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 4 +
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 49 +-
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 54 +-
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 1 +
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 115 ++-
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 137 ++-
.../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 90 +-
.../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 90 +-
.../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 120 ++-
.../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 3 +-
.../lower-work-group-id-intrinsics-hsa.ll | 6 +-
.../lower-work-group-id-intrinsics-pal.ll | 6 +-
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 17 +
.../AMDGPU/memory-legalizer-flat-agent.ll | 156 ++++
.../AMDGPU/memory-legalizer-flat-lastuse.ll | 8 +
.../memory-legalizer-flat-nontemporal.ll | 26 +
.../memory-legalizer-flat-singlethread.ll | 156 ++++
.../AMDGPU/memory-legalizer-flat-system.ll | 156 ++++
.../AMDGPU/memory-legalizer-flat-volatile.ll | 26 +
.../AMDGPU/memory-legalizer-flat-wavefront.ll | 154 ++++
.../AMDGPU/memory-legalizer-flat-workgroup.ll | 148 +++
.../AMDGPU/memory-legalizer-global-agent.ll | 150 ++++
.../AMDGPU/memory-legalizer-global-lastuse.ll | 8 +
.../memory-legalizer-global-nontemporal.ll | 18 +
.../memory-legalizer-global-singlethread.ll | 152 ++++
.../AMDGPU/memory-legalizer-global-system.ll | 142 +++
.../memory-legalizer-global-volatile.ll | 20 +
.../memory-legalizer-global-wavefront.ll | 152 ++++
.../memory-legalizer-global-workgroup.ll | 152 ++++
.../memory-legalizer-invalid-syncscope.ll | 1 +
.../AMDGPU/memory-legalizer-local-agent.ll | 120 +++
.../memory-legalizer-local-nontemporal.ll | 16 +
.../memory-legalizer-local-singlethread.ll | 120 +++
.../AMDGPU/memory-legalizer-local-system.ll | 120 +++
.../AMDGPU/memory-legalizer-local-volatile.ll | 14 +
.../memory-legalizer-local-wavefront.ll | 120 +++
.../memory-legalizer-local-workgroup.ll | 120 +++
.../memory-legalizer-private-lastuse.ll | 6 +
.../memory-legalizer-private-nontemporal.ll | 18 +
.../memory-legalizer-private-volatile.ll | 16 +
llvm/test/CodeGen/AMDGPU/mul.ll | 6 +
llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 91 +-
.../CodeGen/AMDGPU/offset-split-global.ll | 91 +-
.../AMDGPU/pseudo-scalar-transcendental.ll | 66 +-
llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll | 8 +-
.../AMDGPU/select-flags-to-fmin-fmax.ll | 42 +
.../CodeGen/AMDGPU/valu-mask-write-hazard.mir | 168 ++--
.../CodeGen/AMDGPU/valu-read-sgpr-hazard.mir | 848 ++++++++++++++++++
.../CodeGen/AMDGPU/vcmpx-permlane-hazard.mir | 5 +-
119 files changed, 7066 insertions(+), 1041 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a402fc6d7e611..9815b8c0d5d73 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -14,6 +14,8 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -1104,6 +1106,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixWMMAHazards(MI);
fixShift64HighRegBug(MI);
fixVALUMaskWriteHazard(MI);
+ fixVALUReadSGPRHazard(MI);
}
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -2759,6 +2762,36 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
return false;
}
+// Adjust global offsets for instructions bundled with S_GETPC_B64 after
+// insertion of a new instruction.
+static void updateGetPCBundle(MachineInstr *NewMI) {
+ if (!NewMI->isBundled())
+ return;
+
+ // Find start of bundle.
+ auto I = NewMI->getIterator();
+ while (I->isBundledWithPred())
+ I--;
+ if (I->isBundle())
+ I++;
+
+ // Bail if this is not an S_GETPC bundle.
+ if (I->getOpcode() != AMDGPU::S_GETPC_B64)
+ return;
+
+ // Update offsets of any references in the bundle.
+ const unsigned NewBytes = NewMI->getDesc().getSize();
+ auto NextMI = std::next(NewMI->getIterator());
+ auto End = NewMI->getParent()->end();
+ while (NextMI != End && NextMI->isBundledWithPred()) {
+ for (auto &Operand : NextMI->operands()) {
+ if (Operand.isGlobal())
+ Operand.setOffset(Operand.getOffset() + NewBytes);
+ }
+ NextMI++;
+ }
+}
+
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
if (!ST.hasVALUMaskWriteHazard())
return false;
@@ -2876,22 +2909,235 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
auto NextMI = std::next(MI->getIterator());
// Add s_waitcnt_depctr sa_sdst(0) after SALU write.
- BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
- TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
// SALU write may be s_getpc in a bundle.
- if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
- // Update offsets of any references in the bundle.
- while (NextMI != MI->getParent()->end() &&
- NextMI->isBundledWithPred()) {
- for (auto &Operand : NextMI->operands()) {
- if (Operand.isGlobal())
- Operand.setOffset(Operand.getOffset() + 4);
+ updateGetPCBundle(NewMI);
+
+ return true;
+}
+
+static unsigned baseSGPRNumber(Register Reg, const SIRegisterInfo &TRI) {
+ unsigned RegN = TRI.getEncodingValue(Reg);
+ assert(RegN <= 127);
+ return (RegN >> 1) & 0x3f;
+}
+
+// For VALUReadSGPRHazard: pre-compute a bit vector of all SGPRs used by VALUs.
+void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
+ assert(MMF == &MF);
+
+ // Assume non-empty vector means it has already been computed.
+ if (!VALUReadHazardSGPRs.empty())
+ return;
+
+ // Consider all SGPRs hazards if the shader uses function calls or is callee.
+ auto CallingConv = MF.getFunction().getCallingConv();
+ bool UseVALUUseCache = AMDGPU::isEntryFunctionCC(CallingConv) &&
+ !MF.getFrameInfo().hasCalls() &&
+ MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
+
+ VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
+ if (!UseVALUUseCache)
+ return;
+
+ // Perform a post ordered reverse scan to find VALUs which read an SGPR
+ // before a SALU write to the same SGPR. This provides a reduction in
+ // hazard insertion when all VALU access to an SGPR occurs after its last
+ // SALU write, when compared to a linear scan.
+ const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ BitVector SALUWriteSGPRs(64), ReadSGPRs(64);
+ MachineCycleInfo CI;
+ CI.compute(*MMF);
+
+ for (auto *MBB : post_order(&MF)) {
+ bool InCycle = CI.getCycle(MBB) != nullptr;
+ for (auto &MI : reverse(MBB->instrs())) {
+ bool IsVALU = SIInstrInfo::isVALU(MI);
+ bool IsSALU = SIInstrInfo::isSALU(MI);
+ if (!(IsVALU || IsSALU))
+ continue;
+
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+ Register Reg = Op.getReg();
+ // Only consider implicit operands of VCC.
+ if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
+ Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
+ continue;
+ if (!TRI.isSGPRReg(MRI, Reg))
+ continue;
+ if (TRI.getEncodingValue(Reg) >= SGPR_NULL)
+ continue;
+ unsigned RegN = baseSGPRNumber(Reg, TRI);
+ if (IsVALU && Op.isUse()) {
+ // Note: any access within a cycle must be considered a hazard.
+ if (InCycle || (ReadSGPRs[RegN] && SALUWriteSGPRs[RegN]))
+ VALUReadHazardSGPRs.set(RegN);
+ ReadSGPRs.set(RegN);
+ } else if (IsSALU) {
+ if (Op.isDef())
+ SALUWriteSGPRs.set(RegN);
+ else
+ ReadSGPRs.set(RegN);
+ }
}
- NextMI++;
}
}
+}
+
+bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
+ if (!ST.hasVALUReadSGPRHazard())
+ return false;
+
+ // The hazard sequence is fundamentally three instructions:
+ // 1. VALU reads SGPR
+ // 2. SALU writes SGPR
+ // 3. VALU/SALU reads SGPR
+ // We do not search for (1) because the expiry point of the hazard
+ // is indeterminate; however, the hazard between (2) and (3) can
+ // expire if the gap contains sufficient SALU instructions with no
+ // usage of SGPR from (1).
+ // Note: SGPRs must be considered as 64-bit pairs as hazard exists
+ // even if individual SGPRs are accessed.
+
+ bool MIIsSALU = SIInstrInfo::isSALU(*MI);
+ bool MIIsVALU = SIInstrInfo::isVALU(*MI);
+ if (!(MIIsSALU || MIIsVALU))
+ return false;
+
+ // Always mitigate before a call/return as the callee/caller will not
+ // see the hazard chain, i.e. (2) to (3) described above.
+ if (MI->getOpcode() == AMDGPU::S_SETPC_B64 ||
+ MI->getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ MI->getOpcode() == AMDGPU::S_SWAPPC_B64 ||
+ MI->getOpcode() == AMDGPU::S_CALL_B64) {
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ return true;
+ }
+
+ // Avoid expensive search when compile time is priority by
+ // mitigating every SALU which writes an SGPR.
+ if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
+ if (!SIInstrInfo::isSALU(*MI) || SIInstrInfo::isSOPP(*MI))
+ return false;
+
+ const MachineOperand *SDSTOp =
+ TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
+ if (!SDSTOp || !SDSTOp->isReg())
+ return false;
+
+ const Register HazardReg = SDSTOp->getReg();
+ if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
+ HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)
+ return false;
+
+ // Add s_wait_alu sa_sdst(0) after SALU write.
+ auto NextMI = std::next(MI->getIterator());
+ auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+
+ // SALU write may be s_getpc in a bundle.
+ updateGetPCBundle(NewMI);
+
+ return true;
+ }
+
+ // Pre-compute set of SGPR pairs read by VALUs.
+ // Note: pass mutable pointer to MachineFunction for CycleInfo.
+ computeVALUHazardSGPRs(MI->getMF());
+
+ // If no VALUs hazard SGPRs exist then nothing to do.
+ if (VALUReadHazardSGPRs.none())
+ return false;
+
+ // Collect all SGPR sources for MI which are read by a VALU.
+ const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallSet<Register, 4> SGPRsUsed;
+
+ for (const MachineOperand &Op : MI->all_uses()) {
+ Register OpReg = Op.getReg();
+
+ // Only consider VCC implicit uses on VALUs.
+ // The only expected SALU implicit access is SCC which is no hazard.
+ if (MIIsSALU && Op.isImplicit())
+ continue;
+
+ if (!TRI.isSGPRReg(MRI, OpReg))
+ continue;
+
+ // Ignore special purposes registers such as NULL, EXEC, and M0.
+ if (TRI.getEncodingValue(OpReg) >= SGPR_NULL)
+ continue;
+
+ unsigned RegN = baseSGPRNumber(OpReg, TRI);
+ if (!VALUReadHazardSGPRs[RegN])
+ continue;
+
+ SGPRsUsed.insert(OpReg);
+ }
+
+ // No SGPRs -> nothing to do.
+ if (SGPRsUsed.empty())
+ return false;
+
+ // A hazard is any SALU which writes one of the SGPRs read by MI.
+ auto IsHazardFn = [this, &SGPRsUsed](const MachineInstr &I) {
+ if (!SIInstrInfo::isSALU(I))
+ return false;
+ // Check for any register writes.
+ return llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
+ return I.modifiesRegister(Reg, &TRI);
+ });
+ };
+
+ const int SALUExpiryCount = SIInstrInfo::isSALU(*MI) ? 10 : 11;
+ auto IsExpiredFn = [&](const MachineInstr &I, int Count) {
+ if (Count >= SALUExpiryCount)
+ return true;
+ // s_wait_alu sa_sdst(0) on path mitigates hazard.
+ if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
+ return true;
+ return false;
+ };
+
+ auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {
+ // Only count true SALUs as wait states.
+ if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
+ return 0;
+ // SALU must be unrelated to any hazard registers.
+ if (llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
+ return I.readsRegister(Reg, &TRI);
+ }))
+ return 0;
+ return 1;
+ };
+
+ // Check for the hazard.
+ DenseSet<const MachineBasicBlock *> Visited;
+ int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
+ std::next(MI->getReverseIterator()), 0,
+ IsExpiredFn, Visited, WaitStatesFn);
+
+ if (WaitStates >= SALUExpiryCount)
+ return false;
+
+ // Add s_wait_alu sa_sdst(0) before SALU read.
+ auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII.get(AMDGPU::S_WAITCNT_DEPCTR))
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+
+ // SALU read may be after s_getpc in a bundle.
+ updateGetPCBundle(NewMI);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 3ccca527c626b..31d5b3d517193 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -48,6 +48,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
const SIRegisterInfo &TRI;
TargetSchedModel TSchedModel;
bool RunLdsBranchVmemWARHazardFixup;
+ BitVector VALUReadHazardSGPRs;
/// RegUnits of uses in the current soft memory clause.
BitVector ClauseUses;
@@ -107,6 +108,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
bool fixWMMAHazards(MachineInstr *MI);
bool fixShift64HighRegBug(MachineInstr *MI);
bool fixVALUMaskWriteHazard(MachineInstr *MI);
+ void computeVALUHazardSGPRs(MachineFunction *MMF);
+ bool fixVALUReadSGPRHazard(MachineInstr *MI);
int checkMAIHazards(MachineInstr *MI);
int checkMAIHazards908(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index e5817594a4521..1d151432f20b8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1245,6 +1245,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
+ bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
+
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts; }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index c701e873fdd2c..40fc7139c2646 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -23,6 +23,7 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32:
@@ -96,6 +97,7 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32:
@@ -169,6 +171,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f64:
@@ -246,6 +249,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f64:
@@ -334,13 +338,15 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -550,12 +556,14 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -758,13 +766,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -986,12 +996,14 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1200,13 +1212,15 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1411,12 +1425,14 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1614,13 +1630,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1841,12 +1859,14 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2057,12 +2077,14 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -2307,12 +2329,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -2551,12 +2575,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2824,12 +2850,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 90110e6e0c09e..ddb3b7d11185f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -23,6 +23,7 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32:
@@ -96,6 +97,7 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32:
@@ -169,6 +171,7 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f64:
@@ -246,6 +249,7 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f64:
@@ -334,13 +338,15 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -550,12 +556,14 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -758,13 +766,15 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -986,12 +996,14 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1200,13 +1212,15 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1411,12 +1425,14 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1614,13 +1630,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1841,12 +1859,14 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -2057,12 +2077,14 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -2307,12 +2329,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -2551,12 +2575,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -2824,12 +2850,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
index c7676e9da6f49..889a33432be06 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
@@ -17,6 +17,7 @@ define float @test_fmed3_f32_known_nnan_ieee_true(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call nnan float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
@@ -38,6 +39,7 @@ define half @test_fmed3_f16_known_nnan_ieee_false(half %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul half %a, 2.0
%fmed = call nnan half @llvm.amdgcn.fmed3.f16(half %fmul, half 0.0, half 1.0)
@@ -63,6 +65,7 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 {
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f32_e64 v0, 0x41200000, v0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmin = call float @llvm.minnum.f32(float %a, float 10.0)
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmin, float 0.0, float 1.0)
@@ -85,6 +88,7 @@ define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0)
@@ -108,6 +112,7 @@ define float @test_fmed3_global_nnan(float %a) #3 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
@@ -135,6 +140,7 @@ define float @test_fmed3_f32_maybe_NaN_ieee_false(float %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0)
@@ -161,6 +167,7 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false(float %a) #4 {
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f32_e64 v0, 0x41200000, v0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmin = call float @llvm.minnum.f32(float %a, float 10.0)
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmin, float 0.0, float 1.0)
@@ -183,6 +190,7 @@ define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index ca0047bba6c4b..55e38e3a32162 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -17,6 +17,7 @@ define float @test_min_max_ValK0_K1_f32(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%maxnum = call nnan float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -39,6 +40,7 @@ define double @test_min_max_K0Val_K1_f64(double %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f64_e64 v[0:1], v[0:1], 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul double %a, 2.0
%maxnum = call nnan double @llvm.maxnum.f64(double 0.0, double %fmul)
@@ -62,6 +64,7 @@ define half @test_min_K1max_ValK0_f16(half %a) #2 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul half %a, 2.0
%maxnum = call half @llvm.maxnum.f16(half %fmul, half 0.0)
@@ -84,6 +87,7 @@ define <2 x half> @test_min_K1max_K0Val_f16(<2 x half> %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
%maxnum = call nnan <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 0.0, half 0.0>, <2 x half> %fmul)
@@ -106,6 +110,7 @@ define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
%maxnum = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 0.0, half undef>, <2 x half> %fmul)
@@ -130,6 +135,7 @@ define float @test_max_min_ValK1_K0_f32(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%minnum = call nnan float @llvm.minnum.f32(float %fmul, float 1.0)
@@ -152,6 +158,7 @@ define double @test_max_min_K1Val_K0_f64(double %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f64_e64 v[0:1], v[0:1], 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul double %a, 2.0
%minnum = call nnan double @llvm.minnum.f64(double 1.0, double %fmul)
@@ -174,6 +181,7 @@ define half @test_max_K0min_ValK1_f16(half %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul half %a, 2.0
%minnum = call nnan half @llvm.minnum.f16(half %fmul, half 1.0)
@@ -197,6 +205,7 @@ define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
%minnum = call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> <half 1.0, half undef>, <2 x half> %fmul)
@@ -221,6 +230,7 @@ define float @test_min_max_global_nnan(float %a) #3 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call float @llvm.maxnum.f32(float %a, float 0.0)
%fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0)
@@ -242,6 +252,7 @@ define float @test_max_min_global_nnan(float %a) #3 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 1.0)
%fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0)
@@ -269,6 +280,7 @@ define float @test_min_max_K0_gt_K1(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maxmin_num_f32 v0, v0, 1.0, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan float @llvm.maxnum.f32(float %a, float 1.0)
%fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 0.0)
@@ -292,6 +304,7 @@ define float @test_max_min_K0_gt_K1(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0, 1.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan float @llvm.minnum.f32(float %a, float 0.0)
%fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 1.0)
@@ -318,6 +331,7 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -342,6 +356,7 @@ define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) #
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -370,6 +385,7 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 1.0, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%minnum = call float @llvm.minnum.f32(float %fmul, float 1.0)
@@ -396,6 +412,7 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 1.0, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%minnum = call float @llvm.minnum.f32(float %fmul, float 1.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 43f3dcc86f426..21f8e188ff3d2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -31,6 +31,7 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
%elt = extractelement <64 x i32> %vec, i32 %idx
@@ -63,6 +64,7 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: global_load_u16 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%vec = load <128 x i16>, ptr addrspace(1) %ptr
%elt = extractelement <128 x i16> %vec, i32 %idx
@@ -95,6 +97,7 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%vec = load <32 x i64>, ptr addrspace(1) %ptr
%elt = extractelement <32 x i64> %vec, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a5e4151bf3695..9c166cc0e6222 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -83,9 +83,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
; GFX12-NEXT: s_and_b32 s0, s0, 15
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -265,6 +266,7 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [32 x float], align 4, addrspace(5)
@@ -316,6 +318,7 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -409,9 +412,10 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
; GFX12-NEXT: s_and_b32 s0, s0, 15
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -622,6 +626,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
@@ -725,9 +730,10 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
; GFX12-NEXT: s_and_b32 s0, s0, 15
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -942,6 +948,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -1113,6 +1120,7 @@ define void @store_load_large_imm_offset_foo() {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [4096 x i32], align 4, addrspace(5)
@@ -1267,6 +1275,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, ptr addrspace(5) %arg, align 8
@@ -1332,6 +1341,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, ptr addrspace(5) %arg, align 1
@@ -1409,6 +1419,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_mov_b32 s2, 3
; GFX12-NEXT: s_mov_b32 s1, 2
; GFX12-NEXT: s_mov_b32 s0, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
; GFX12-NEXT: v_mov_b32_e32 v1, s0
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1416,6 +1427,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
@@ -1499,6 +1511,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_mov_b32 s2, 3
; GFX12-NEXT: s_mov_b32 s1, 2
; GFX12-NEXT: s_mov_b32 s0, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1506,6 +1519,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
index 75c4cd53e3bfc..183a657df83c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
@@ -24,6 +24,7 @@ define float @test_min_max_ValK0_K1_f32(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
%fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -51,6 +52,7 @@ define float @test_min_max_K0Val_K1_f32(float %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan float @llvm.maxnum.f32(float 2.0, float %a)
%fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -85,6 +87,7 @@ define half @test_min_K1max_ValK0_f16(half %a) #0 {
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call half @llvm.maxnum.f16(half %a, half 2.0)
%fmed = call half @llvm.minnum.f16(half 4.0, half %maxnum)
@@ -113,6 +116,7 @@ define half @test_min_K1max_K0Val_f16(half %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan half @llvm.maxnum.f16(half 2.0, half %a)
%fmed = call nnan half @llvm.minnum.f16(half 4.0, half %maxnum)
@@ -141,6 +145,7 @@ define float @test_max_min_ValK1_K0_f32(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan float @llvm.minnum.f32(float %a, float 4.0)
%fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -168,6 +173,7 @@ define float @test_max_min_K1Val_K0_f32(float %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan float @llvm.minnum.f32(float 4.0, float %a)
%fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -196,6 +202,7 @@ define half @test_max_K0min_ValK1_f16(half %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan half @llvm.minnum.f16(half %a, half 4.0)
%fmed = call nnan half @llvm.maxnum.f16(half 2.0, half %minnum)
@@ -224,6 +231,7 @@ define half @test_max_K0min_K1Val_f16(half %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan half @llvm.minnum.f16(half 4.0, half %a)
%fmed = call nnan half @llvm.maxnum.f16(half 2.0, half %minnum)
@@ -253,6 +261,7 @@ define float @test_min_max_global_nnan(float %a) #2 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
%fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -280,6 +289,7 @@ define float @test_max_min_global_nnan(float %a) #2 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 4.0)
%fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -314,6 +324,7 @@ define float @test_min_max_K0_gt_K1(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maxmin_num_f32 v0, v0, 4.0, 2.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan float @llvm.maxnum.f32(float %a, float 4.0)
%fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 2.0)
@@ -344,6 +355,7 @@ define float @test_max_min_K0_gt_K1(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan float @llvm.minnum.f32(float %a, float 2.0)
%fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 4.0)
@@ -374,6 +386,7 @@ define float @test_min_max_non_inline_const(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maxmin_num_f32 v0, v0, 2.0, 0x41000000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
%fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 8.0)
@@ -407,6 +420,7 @@ define double @test_min_max_f64(double %a) #0 {
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 2.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan double @llvm.maxnum.f64(double %a, double 2.0)
%fmed = call nnan double @llvm.minnum.f64(double %maxnum, double 4.0)
@@ -443,6 +457,7 @@ define <2 x half> @test_min_max_v2f16(<2 x half> %a) #0 {
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
%fmed = call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> %maxnum, <2 x half> <half 4.0, half 4.0>)
@@ -477,6 +492,7 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
%fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -510,6 +526,7 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 4.0)
%fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -544,6 +561,7 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 4.0)
%fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index 59818b0b1bc39..623616740f9be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -28,6 +28,7 @@ define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
ret i32 %ret
@@ -61,6 +62,7 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
@@ -91,6 +93,7 @@ define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
ret void
@@ -124,6 +127,7 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index 3def36766fbe0..cbc143b738950 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -29,6 +29,7 @@ define float @v_rsq_clamp_f32(float %src) #0 {
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
ret float %rsq_clamp
@@ -60,6 +61,7 @@ define float @v_rsq_clamp_fabs_f32(float %src) #0 {
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call float @llvm.fabs.f32(float %src)
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src)
@@ -94,11 +96,13 @@ define double @v_rsq_clamp_f64(double %src) #0 {
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s0, -1
; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
ret double %rsq_clamp
@@ -132,11 +136,13 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
; GFX12-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]|
; GFX12-NEXT: s_mov_b32 s0, -1
; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call double @llvm.fabs.f64(double %src)
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
@@ -169,6 +175,7 @@ define float @v_rsq_clamp_undef_f32() #0 {
; GFX12-NEXT: v_mov_b32_e32 v0, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, s0, 0x7f7fffff, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
ret float %rsq_clamp
@@ -202,11 +209,13 @@ define double @v_rsq_clamp_undef_f64() #0 {
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
; GFX12-NEXT: s_mov_b32 s0, -1
; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
ret double %rsq_clamp
@@ -238,6 +247,7 @@ define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
ret float %rsq_clamp
@@ -271,11 +281,13 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s0, -1
; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], s[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
ret double %rsq_clamp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 6bb104311a4d8..a0853f9b9808f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -18,6 +18,7 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_alu 0xfffe
; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -62,6 +63,7 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4
; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
+; GFX12-NOUNALIGNED-NEXT: s_wait_alu 0xfffe
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -232,6 +234,7 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_alu 0xfffe
; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
@@ -254,6 +257,7 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; GFX12-NOUNALIGNED-NEXT: s_wait_alu 0xfffe
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
@@ -355,6 +359,7 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align4:
@@ -401,6 +406,7 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_i96_align8:
@@ -447,6 +453,7 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align8:
@@ -493,6 +500,7 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v6i16_align8:
@@ -560,6 +568,7 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
; GFX12-NEXT: v_mov_b32_e32 v8, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v12
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v12i8_align8:
@@ -641,6 +650,7 @@ define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index b0f3eee3c7363..ca9685d9a0f8f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -79,6 +79,7 @@ define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
@@ -164,6 +165,7 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
@@ -253,6 +255,7 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
@@ -298,6 +301,7 @@ define i32 @v_mul_i32(i32 %num, i32 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i32 %num, %den
ret i32 %result
@@ -349,6 +353,7 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
@@ -509,6 +514,7 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v0, v3, v[4:5]
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i64 %num, %den
ret i64 %result
@@ -691,6 +697,7 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
@@ -1039,6 +1046,7 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i128 %num, %den
ret i128 %result
@@ -2469,6 +2477,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
index daed0986fa9c8..3124d5e7ed7d8 100644
--- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
@@ -63,6 +63,7 @@ define i16 @abs_i16(i16 %arg) {
; GFX12-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_i16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
@@ -138,6 +139,7 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) {
; GFX12-NEXT: v_pk_sub_i16 v1, 0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
@@ -231,6 +233,7 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_max_i16 v0, v0, v2
; GFX12-NEXT: v_pk_max_i16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
@@ -339,6 +342,7 @@ define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_max_i16 v0, v0, v2
; GFX12-NEXT: v_pk_max_i16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %arg, i1 false)
ret <4 x i16> %res
@@ -478,6 +482,7 @@ define <6 x i16> @v_abs_v6i16(<6 x i16> %arg) {
; GFX12-NEXT: v_pk_max_i16 v1, v1, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_pk_max_i16 v2, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <6 x i16> @llvm.abs.v6i16(<6 x i16> %arg, i1 false)
ret <6 x i16> %res
@@ -650,6 +655,7 @@ define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_pk_max_i16 v2, v2, v6
; GFX12-NEXT: v_pk_max_i16 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %arg, i1 false)
ret <8 x i16> %res
@@ -943,6 +949,7 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) {
; GFX12-NEXT: v_pk_max_i16 v3, v3, v8
; GFX12-NEXT: v_pk_max_i16 v4, v4, v9
; GFX12-NEXT: v_pk_max_i16 v5, v5, v10
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
ret <16 x i16> %res
@@ -1487,6 +1494,7 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) {
; GFX12-NEXT: v_pk_max_i16 v13, v13, v18
; GFX12-NEXT: v_pk_max_i16 v14, v14, v19
; GFX12-NEXT: v_pk_max_i16 v15, v15, v20
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %arg, i1 false)
ret <32 x i16> %res
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index b1134ae78cb97..36ff06b2af57a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -224,6 +224,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -233,8 +234,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -256,6 +258,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -264,8 +267,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
@@ -506,6 +510,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -516,8 +521,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
@@ -539,6 +545,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -548,8 +555,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
@@ -877,7 +885,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -898,6 +906,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_cbranch_execz .LBB2_4
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -923,22 +932,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB2_4
; GFX12W32-NEXT: ; %bb.3:
@@ -947,6 +959,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1288,7 +1301,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -1311,6 +1324,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_clause 0x1
; GFX12W64-NEXT: s_load_b32 s5, s[2:3], 0x44
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mov_b32_e32 v2, s5
@@ -1337,22 +1351,25 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB3_4
; GFX12W32-NEXT: ; %bb.3:
@@ -1363,6 +1380,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s8
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB3_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1716,6 +1734,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1725,8 +1744,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -1749,6 +1769,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1757,8 +1778,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
@@ -2003,6 +2025,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2013,8 +2036,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
@@ -2037,6 +2061,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2046,8 +2071,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
@@ -2377,7 +2403,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -2398,6 +2424,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_cbranch_execz .LBB7_4
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -2423,22 +2450,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB7_4
; GFX12W32-NEXT: ; %bb.3:
@@ -2447,6 +2477,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index bc5d2662dcb45..9398d686c4475 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -264,6 +264,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -273,8 +274,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: ; %bb.1:
; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_mul_i32 s6, s6, 5
; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v1, s6
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s8, s2
@@ -301,6 +304,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232-NEXT: s_mov_b32 s5, exec_lo
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -309,8 +313,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: ; %bb.1:
; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_mul_i32 s5, s5, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mov_b32_e32 v1, s5
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
@@ -595,6 +601,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b64 s[0:1], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -634,6 +641,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: s_mov_b32 s1, exec_lo
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1012,7 +1020,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
@@ -1033,6 +1041,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1
@@ -1064,23 +1073,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6
; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1
; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5
; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB2_4
; GFX1232_ITERATIVE-NEXT: ; %bb.3:
@@ -1095,6 +1107,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: .LBB2_4:
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
@@ -1520,6 +1533,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1527,12 +1541,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1264_DPP-NEXT: s_mov_b32 s4, s9
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
@@ -1550,6 +1566,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1264_DPP-NEXT: .LBB2_2:
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0
@@ -1590,13 +1607,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
@@ -1604,6 +1622,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2
; GFX1232_DPP-NEXT: ; %bb.1:
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
@@ -1899,6 +1918,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
; GFX1264-NEXT: s_mov_b32 s9, 0
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1911,6 +1931,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v0, s6
; GFX1264-NEXT: v_mov_b32_e32 v1, s7
; GFX1264-NEXT: s_wait_kmcnt 0x0
@@ -1939,6 +1960,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
; GFX1232-NEXT: s_mov_b32 s5, 0
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
; GFX1232-NEXT: s_mov_b32 s6, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1948,8 +1970,10 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: ; %bb.1:
; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
@@ -2287,6 +2311,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b32 s11, 0
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2299,6 +2324,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: v_mov_b32_e32 v1, s9
; GFX1264-NEXT: s_mov_b32 s10, -1
@@ -2330,6 +2356,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX1232-NEXT: s_mov_b32 s9, exec_lo
; GFX1232-NEXT: s_mov_b32 s3, 0
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2350,6 +2377,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB4_2:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
@@ -2764,7 +2792,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
@@ -2786,6 +2814,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB5_4
; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
@@ -2820,8 +2849,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
@@ -2829,6 +2859,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3595,6 +3626,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63
; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32
; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -3602,6 +3634,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48
; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9]
; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec
@@ -3620,6 +3653,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1264_DPP-NEXT: .LBB5_2:
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8
@@ -3693,12 +3727,14 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16
; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo
@@ -3716,6 +3752,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1232_DPP-NEXT: .LBB5_2:
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8
@@ -3988,6 +4025,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3997,8 +4035,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: ; %bb.1:
; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_mul_i32 s6, s6, 5
; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v1, s6
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s8, s2
@@ -4026,6 +4066,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232-NEXT: s_mov_b32 s5, exec_lo
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4034,8 +4075,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: ; %bb.1:
; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_mul_i32 s5, s5, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mov_b32_e32 v1, s5
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
@@ -4325,6 +4368,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b64 s[0:1], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4365,6 +4409,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: s_mov_b32 s1, exec_lo
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -4744,7 +4789,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s7
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
@@ -4765,6 +4810,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v1, s6
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s10, -1
@@ -4796,23 +4842,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s5, v1, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s6
; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s4, s4, s5
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232_ITERATIVE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr1
; GFX1232_ITERATIVE-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_xor_b32 s5, exec_lo, s5
; GFX1232_ITERATIVE-NEXT: s_cbranch_execz .LBB8_4
; GFX1232_ITERATIVE-NEXT: ; %bb.3:
@@ -4827,6 +4876,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: .LBB8_4:
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
@@ -5252,6 +5302,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1264_DPP-NEXT: v_readlane_b32 s7, v1, 31
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s6, 16
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5259,12 +5310,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1264_DPP-NEXT: v_readlane_b32 s9, v1, 63
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s7, 32
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1264_DPP-NEXT: s_mov_b32 s4, s9
; GFX1264_DPP-NEXT: v_writelane_b32 v3, s8, 48
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1264_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
@@ -5282,6 +5335,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1264_DPP-NEXT: .LBB8_2:
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v0
@@ -5322,13 +5376,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
@@ -5336,6 +5391,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2
; GFX1232_DPP-NEXT: ; %bb.1:
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX1232_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
@@ -5645,6 +5701,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
; GFX1264-NEXT: s_mov_b32 s9, 0
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5657,6 +5714,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v0, s6
; GFX1264-NEXT: v_mov_b32_e32 v1, s7
; GFX1264-NEXT: s_wait_kmcnt 0x0
@@ -5688,6 +5746,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
; GFX1232-NEXT: s_mov_b32 s5, 0
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
; GFX1232-NEXT: s_mov_b32 s6, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5697,8 +5756,10 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: ; %bb.1:
; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
@@ -6052,6 +6113,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b32 s11, 0
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6064,6 +6126,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: v_mov_b32_e32 v1, s9
; GFX1264-NEXT: s_mov_b32 s10, -1
@@ -6099,6 +6162,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX1232-NEXT: s_mov_b32 s9, exec_lo
; GFX1232-NEXT: s_mov_b32 s3, 0
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -6119,6 +6183,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB10_2:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
@@ -6537,7 +6602,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
-; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
@@ -6559,6 +6624,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX1264_ITERATIVE-NEXT: s_cbranch_execz .LBB11_4
; GFX1264_ITERATIVE-NEXT: ; %bb.3:
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, s4
; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, s5
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s11, 0x31016000
@@ -6593,8 +6659,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
-; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
@@ -6602,6 +6669,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s4, s1
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -7368,6 +7436,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s7, v3, 63
; GFX1264_DPP-NEXT: v_writelane_b32 v1, s8, 32
; GFX1264_DPP-NEXT: v_writelane_b32 v2, s9, 32
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1264_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7375,6 +7444,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX1264_DPP-NEXT: v_writelane_b32 v1, s10, 48
; GFX1264_DPP-NEXT: v_writelane_b32 v2, s11, 48
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[8:9]
; GFX1264_DPP-NEXT: s_mov_b32 s6, -1
; GFX1264_DPP-NEXT: s_mov_b64 s[8:9], exec
@@ -7393,6 +7463,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1264_DPP-NEXT: .LBB11_2:
+; GFX1264_DPP-NEXT: s_wait_alu 0xfffe
; GFX1264_DPP-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s2, v8
@@ -7466,12 +7537,14 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16
; GFX1232_DPP-NEXT: v_writelane_b32 v2, s8, 16
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
; GFX1232_DPP-NEXT: s_mov_b32 s8, exec_lo
@@ -7489,6 +7562,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
; GFX1232_DPP-NEXT: .LBB11_2:
+; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1232_DPP-NEXT: v_readfirstlane_b32 s2, v8
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index f636fa5d83a57..96309ccddb4ea 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -223,6 +223,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -232,8 +233,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -255,6 +257,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -263,8 +266,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
@@ -505,6 +509,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -515,8 +520,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
@@ -538,6 +544,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -547,8 +554,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
@@ -876,7 +884,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -897,6 +905,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_cbranch_execz .LBB2_4
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -922,22 +931,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB2_4
; GFX12W32-NEXT: ; %bb.3:
@@ -946,6 +958,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1299,6 +1312,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1308,8 +1322,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -1332,6 +1347,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1340,8 +1356,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
@@ -1586,6 +1603,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1596,8 +1614,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
@@ -1620,6 +1639,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1629,8 +1649,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
@@ -1960,7 +1981,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -1981,6 +2002,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_cbranch_execz .LBB6_4
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
@@ -2006,22 +2028,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB6_4
; GFX12W32-NEXT: ; %bb.3:
@@ -2030,6 +2055,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 3e8565d34c6be..f6b9f8ba058dd 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -230,6 +230,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -240,8 +241,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
@@ -263,6 +265,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -271,8 +274,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
@@ -520,6 +524,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -531,8 +536,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
@@ -554,6 +560,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -563,8 +570,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
@@ -898,7 +906,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -920,6 +928,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
@@ -945,22 +954,25 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB2_4
; GFX12W32-NEXT: ; %bb.3:
@@ -969,6 +981,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1463,6 +1476,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1473,8 +1487,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
@@ -1497,6 +1512,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1505,8 +1521,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
@@ -1758,6 +1775,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1769,8 +1787,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
-; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
@@ -1793,6 +1812,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1802,8 +1822,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s4, s0, s4
-; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
@@ -2139,7 +2160,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[0:1]
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s5
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5
; GFX12W64-NEXT: v_writelane_b32 v0, s4, s5
@@ -2161,6 +2182,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
@@ -2186,22 +2208,25 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s4, s1
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s5, v1, s4
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s4
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s4
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s5
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12W32-NEXT: s_cbranch_execz .LBB7_4
; GFX12W32-NEXT: ; %bb.3:
@@ -2210,6 +2235,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 22e00b2f5a6b1..0fe447597d4ba 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -108,13 +108,15 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT: global_inv scope:SCOPE_SYS
; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB0_1
; GFX1200-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1200-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: v_mov_b32_e32 v0, v3
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%res = atomicrmw fadd ptr %addr, float %val seq_cst
ret float %res
@@ -219,6 +221,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT: global_inv scope:SCOPE_SE
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
ret float %res
@@ -354,6 +357,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE
; GFX1200-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1200-NEXT: global_inv scope:SCOPE_SE
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
ret void
@@ -455,13 +459,15 @@ define float @no_unsafe(ptr %addr, float %val) {
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT: global_inv scope:SCOPE_SE
; GFX1200-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB3_1
; GFX1200-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX1200-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: v_mov_b32_e32 v0, v3
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
ret float %res
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 23e8f98a7861b..6b5d1eb4cd41c 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -27,6 +27,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
@@ -240,6 +241,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
@@ -436,13 +438,16 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2:
@@ -450,6 +455,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall:
@@ -848,7 +854,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -866,12 +872,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB3_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset:
@@ -1122,7 +1130,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
@@ -1139,12 +1147,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset:
@@ -1395,11 +1405,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2:
@@ -1424,11 +1437,14 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
@@ -1438,11 +1454,12 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB5_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall:
@@ -1914,14 +1931,17 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v5, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
@@ -1934,6 +1954,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
@@ -1943,12 +1964,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
@@ -2284,14 +2306,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
@@ -2304,6 +2329,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
@@ -2313,11 +2339,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
@@ -2659,10 +2686,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2:
@@ -2692,11 +2722,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2706,12 +2739,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: v_mov_b32_e32 v7, v8
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB8_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
@@ -3284,14 +3318,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
@@ -3312,6 +3349,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
@@ -3321,12 +3359,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
@@ -3709,14 +3748,17 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
@@ -3737,6 +3779,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
@@ -3746,11 +3789,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
@@ -4138,10 +4182,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2:
@@ -4181,11 +4228,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4195,12 +4245,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB11_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
@@ -4824,6 +4875,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
@@ -5096,6 +5148,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
@@ -5354,13 +5407,16 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2:
@@ -5368,6 +5424,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
@@ -5850,6 +5907,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
@@ -6258,6 +6316,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
@@ -6662,13 +6721,16 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2:
@@ -6676,6 +6738,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall:
@@ -7314,7 +7377,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -7332,12 +7395,14 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index ec0408236975d..f299b5474f100 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -27,6 +27,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
@@ -227,6 +228,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
@@ -428,13 +430,16 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2:
@@ -442,6 +447,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
@@ -815,7 +821,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -835,12 +841,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB3_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
@@ -1032,7 +1040,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -1051,12 +1059,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
@@ -1252,11 +1262,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2:
@@ -1284,11 +1297,14 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
@@ -1298,11 +1314,12 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB5_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
@@ -1670,14 +1687,17 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
@@ -1690,8 +1710,9 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
@@ -1701,12 +1722,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
@@ -2056,14 +2078,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
@@ -2076,8 +2101,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
@@ -2087,11 +2113,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
@@ -2446,10 +2473,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2:
@@ -2483,11 +2513,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2497,12 +2530,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB8_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
@@ -3088,14 +3122,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
@@ -3116,6 +3153,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
@@ -3125,12 +3163,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
@@ -3515,14 +3554,17 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
@@ -3543,6 +3585,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
@@ -3552,11 +3595,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
@@ -3946,10 +3990,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2:
@@ -3989,11 +4036,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4003,12 +4053,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB11_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
@@ -4630,8 +4681,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -4649,12 +4701,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrsp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
@@ -4974,6 +5028,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
; GFX12-NEXT: v_mov_b32_e32 v1, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
@@ -4991,12 +5046,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
@@ -5316,11 +5373,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2:
@@ -5348,11 +5408,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
@@ -5362,12 +5425,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB14_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
@@ -5923,8 +5987,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -5958,12 +6022,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr add
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
@@ -6368,7 +6434,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s5, 0
@@ -6400,12 +6466,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
@@ -6809,11 +6877,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2:
@@ -6856,11 +6927,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
@@ -6870,12 +6944,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB17_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
@@ -7515,7 +7590,8 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -7533,12 +7609,14 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index cd01cc7309fcd..8418bcbbd760f 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -27,6 +27,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
@@ -227,6 +228,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
@@ -428,13 +430,16 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2:
@@ -442,6 +447,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall(ptr ad
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall:
@@ -815,7 +821,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
@@ -835,12 +841,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB3_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset:
@@ -1032,7 +1040,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v6, s4
; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -1051,12 +1059,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset(ptr addrspace(7)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset:
@@ -1252,11 +1262,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2:
@@ -1284,11 +1297,14 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
@@ -1298,11 +1314,12 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall(ptr a
; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB5_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall:
@@ -1670,14 +1687,17 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
@@ -1690,8 +1710,9 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
@@ -1701,12 +1722,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset(ptr addrspace(7) i
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset:
@@ -2056,14 +2078,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
@@ -2076,8 +2101,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
@@ -2087,11 +2113,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset(ptr addrspace(7)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset:
@@ -2446,10 +2473,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2:
@@ -2483,11 +2513,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
@@ -2497,12 +2530,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall(ptr add
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB8_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall:
@@ -3088,14 +3122,17 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
@@ -3116,6 +3153,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
@@ -3125,12 +3163,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset:
@@ -3515,14 +3554,17 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s6, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s6, -4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s6, 3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
@@ -3543,6 +3585,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
@@ -3552,11 +3595,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset(ptr addrspace(7
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset:
@@ -3946,10 +3990,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2:
@@ -3989,11 +4036,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
@@ -4003,12 +4053,13 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall(ptr
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB11_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall:
@@ -4630,8 +4681,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -4649,12 +4701,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset(ptr addrsp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset:
@@ -4974,6 +5028,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
; GFX12-NEXT: v_mov_b32_e32 v1, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
@@ -4991,12 +5046,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset:
@@ -5316,11 +5373,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2:
@@ -5348,11 +5408,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
@@ -5362,12 +5425,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB14_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall:
@@ -5923,8 +5987,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -5958,12 +6022,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset(ptr add
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset:
@@ -6368,7 +6434,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s5, 0
@@ -6400,12 +6466,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset(ptr addrspace
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset:
@@ -6809,11 +6877,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2:
@@ -6856,11 +6927,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
@@ -6870,12 +6944,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB17_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall:
@@ -7515,7 +7590,8 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-NEXT: s_mov_b32 s4, 0
@@ -7533,12 +7609,14 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset:
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index ac03d2dae8fa8..0d7d6e8331418 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -34,12 +34,13 @@ define float @v_mul_f32_vop2(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x10]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%mul = fmul float %x, %y
ret float %mul
}
; NOT-GFX12: codeLenInByte = 12
-; GFX1200: codeLenInByte = 28
+; GFX1200: codeLenInByte = 32
define float @v_mul_f32_vop2_inline_imm(float %x) {
; GFX9-LABEL: v_mul_f32_vop2_inline_imm:
@@ -68,12 +69,13 @@ define float @v_mul_f32_vop2_inline_imm(float %x) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; encoding: [0xf6,0x00,0x00,0x10]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%mul = fmul float %x, 4.0
ret float %mul
}
; NOT-GFX12: codeLenInByte = 12
-; GFX1200: codeLenInByte = 28
+; GFX1200: codeLenInByte = 32
define float @v_mul_f32_vop2_literal(float %x) {
; GFX9-LABEL: v_mul_f32_vop2_literal:
@@ -102,12 +104,13 @@ define float @v_mul_f32_vop2_literal(float %x) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e32 v0, 0x42f60000, v0 ; encoding: [0xff,0x00,0x00,0x10,0x00,0x00,0xf6,0x42]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%mul = fmul float %x, 123.0
ret float %mul
}
; NOT-GFX12: codeLenInByte = 16
-; GFX1200: codeLenInByte = 32
+; GFX1200: codeLenInByte = 36
define float @v_mul_f32_vop3_src_mods(float %x, float %y) {
; GFX9-LABEL: v_mul_f32_vop3_src_mods:
@@ -136,13 +139,14 @@ define float @v_mul_f32_vop3_src_mods(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x00]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%mul = fmul float %fabs.x, %y
ret float %mul
}
; NOT-GFX12: codeLenInByte = 16
-; GFX1200: codeLenInByte = 32
+; GFX1200: codeLenInByte = 36
define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
; GFX9-LABEL: v_mul_f32_vop3_src_mods_inline_imm:
@@ -171,6 +175,7 @@ define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x00]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%mul = fmul float %fabs.x, 4.0
@@ -178,7 +183,7 @@ define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
}
; NOT-GFX12: codeLenInByte = 16
-; GFX1200: codeLenInByte = 32
+; GFX1200: codeLenInByte = 36
define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
; GFX9-LABEL: v_mul_f32_vop3_src_mods_literal:
@@ -208,6 +213,7 @@ define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x00,0x00,0x00,0xf6,0x42]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%mul = fmul float %fabs.x, 123.0
@@ -217,7 +223,7 @@ define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
; GFX9: codeLenInByte = 24
; GFX10: codeLenInByte = 20
; GFX11: codeLenInByte = 20
-; GFX1200: codeLenInByte = 36
+; GFX1200: codeLenInByte = 40
define float @v_mul_f32_vop2_frame_index(float %x) {
; GFX9-LABEL: v_mul_f32_vop2_frame_index:
@@ -248,6 +254,7 @@ define float @v_mul_f32_vop2_frame_index(float %x) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e32 v0, s32, v0 ; encoding: [0x20,0x00,0x00,0x10]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%alloca = alloca i32, addrspace(5)
%ptrtoint = ptrtoint ptr addrspace(5) %alloca to i32
@@ -259,7 +266,7 @@ define float @v_mul_f32_vop2_frame_index(float %x) {
; GFX9: codeLenInByte = 20
; GFX10: codeLenInByte = 20
; GFX11: codeLenInByte = 12
-; GFX1200: codeLenInByte = 28
+; GFX1200: codeLenInByte = 32
define float @v_fma_f32(float %x, float %y, float %z) {
; GFX9-LABEL: v_fma_f32:
@@ -288,13 +295,14 @@ define float @v_fma_f32(float %x, float %y, float %z) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_fma_f32 v0, v0, v1, v2 ; encoding: [0x00,0x00,0x13,0xd6,0x00,0x03,0x0a,0x04]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fma = call float @llvm.fma.f32(float %x, float %y, float %z)
ret float %fma
}
; NOT-GFX12: codeLenInByte = 16
-; GFX1200: codeLenInByte = 32
+; GFX1200: codeLenInByte = 36
define float @v_fma_f32_src_mods(float %x, float %y, float %z) {
; GFX9-LABEL: v_fma_f32_src_mods:
@@ -323,6 +331,7 @@ define float @v_fma_f32_src_mods(float %x, float %y, float %z) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_fma_f32 v0, |v0|, v1, v2 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0x0a,0x04]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z)
@@ -330,7 +339,7 @@ define float @v_fma_f32_src_mods(float %x, float %y, float %z) {
}
; NOT-GFX12: codeLenInByte = 16
-; GFX1200: codeLenInByte = 32
+; GFX1200: codeLenInByte = 36
define float @v_fmac_f32(float %x, float %y) {
; GFX9-LABEL: v_fmac_f32:
@@ -359,6 +368,7 @@ define float @v_fmac_f32(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_fmac_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x56]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fma = call float @llvm.fma.f32(float %x, float %y, float %x)
ret float %fma
@@ -367,7 +377,7 @@ define float @v_fmac_f32(float %x, float %y) {
; GFX9: codeLenInByte = 16
; GFX10: codeLenInByte = 12
; GFX11: codeLenInByte = 12
-; GFX1200: codeLenInByte = 28
+; GFX1200: codeLenInByte = 32
define float @v_fmaak_f32(float %x, float %y) {
; GFX9-LABEL: v_fmaak_f32:
@@ -397,6 +407,7 @@ define float @v_fmaak_f32(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_fmaak_f32 v0, v0, v1, 0x43800000 ; encoding: [0x00,0x03,0x00,0x5a,0x00,0x00,0x80,0x43]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fma = call float @llvm.fma.f32(float %x, float %y, float 256.0)
ret float %fma
@@ -405,7 +416,7 @@ define float @v_fmaak_f32(float %x, float %y) {
; GFX9: codeLenInByte = 24
; GFX10: codeLenInByte = 16
; GFX11: codeLenInByte = 16
-; GFX1200: codeLenInByte = 32
+; GFX1200: codeLenInByte = 36
define float @v_fma_k_f32_src_mods(float %x, float %y) {
; GFX9-LABEL: v_fma_k_f32_src_mods:
@@ -435,6 +446,7 @@ define float @v_fma_k_f32_src_mods(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_fma_f32 v0, |v0|, v1, 0x43800000 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0xfe,0x03,0x00,0x00,0x80,0x43]
+; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fma = call float @llvm.fma.f32(float %fabs.x, float %y, float 256.0)
@@ -444,7 +456,7 @@ define float @v_fma_k_f32_src_mods(float %x, float %y) {
; GFX9: codeLenInByte = 24
; GFX10: codeLenInByte = 20
; GFX11: codeLenInByte = 20
-; GFX1200: codeLenInByte = 36
+; GFX1200: codeLenInByte = 40
define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
; GFX9-LABEL: s_fmaak_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 7252c69cb1cf7..90cc7f2c38599 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -58,6 +58,7 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
%p.0 = load i16, ptr addrspace(1) %p, align 2
@@ -204,6 +205,7 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
%p.0 = load i16, ptr addrspace(1) %p, align 1
@@ -343,6 +345,7 @@ define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
%p.0 = load i16, ptr addrspace(1) %p, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index f9694dcd89abf..fc4a9892ca2dc 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -82,6 +82,7 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v0, off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_load_2xi16_align2:
@@ -93,6 +94,7 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off
; GFX12-FLASTSCR-NEXT: s_wait_loadcnt 0x0
+; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
%p.0 = load i16, ptr addrspace(5) %p, align 2
@@ -178,6 +180,7 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_store_2xi16_align2:
@@ -189,6 +192,7 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
store i16 1, ptr addrspace(5) %r, align 2
@@ -278,6 +282,7 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v0, off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_load_2xi16_align1:
@@ -289,6 +294,7 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off
; GFX12-FLASTSCR-NEXT: s_wait_loadcnt 0x0
+; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
%p.0 = load i16, ptr addrspace(5) %p, align 1
@@ -379,6 +385,7 @@ define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_store_2xi16_align1:
@@ -390,6 +397,7 @@ define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
store i16 1, ptr addrspace(5) %r, align 1
@@ -464,6 +472,7 @@ define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v0, off
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_load_2xi16_align4:
@@ -475,6 +484,7 @@ define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 {
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off
; GFX12-FLASTSCR-NEXT: s_wait_loadcnt 0x0
+; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
%p.0 = load i16, ptr addrspace(5) %p, align 4
@@ -557,6 +567,7 @@ define void @private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_store_2xi16_align4:
@@ -568,6 +579,7 @@ define void @private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off
+; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
store i16 1, ptr addrspace(5) %r, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index f0ce96af90649..a260caafa3afe 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -3030,6 +3030,7 @@ define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %arg)
ret <2 x float> %canon
@@ -3069,6 +3070,7 @@ define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> %arg)
ret <3 x float> %canon
@@ -3110,6 +3112,7 @@ define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %arg)
ret <4 x float> %canon
@@ -3163,6 +3166,7 @@ define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
; GFX12-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
; GFX12-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %arg)
ret <8 x float> %canon
@@ -3199,6 +3203,7 @@ define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %arg)
ret <2 x double> %canon
@@ -3239,6 +3244,7 @@ define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 {
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> %arg)
ret <3 x double> %canon
@@ -3283,6 +3289,7 @@ define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %arg)
ret <4 x double> %canon
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index c5c44d27efbb3..b2494e394f7ec 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -25,6 +25,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -202,6 +203,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -389,6 +391,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -584,6 +587,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -791,6 +795,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -1009,6 +1014,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -1241,13 +1247,15 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -1427,12 +1435,14 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -1608,12 +1618,14 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
@@ -1779,6 +1791,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
@@ -1997,6 +2010,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -2215,6 +2229,7 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
@@ -2437,6 +2452,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -2614,6 +2630,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2801,6 +2818,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
@@ -2996,6 +3014,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -3203,6 +3222,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3421,6 +3441,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
@@ -3653,13 +3674,15 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3839,12 +3862,14 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -4021,13 +4046,15 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -4207,12 +4234,14 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -4380,6 +4409,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
@@ -4557,6 +4587,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
@@ -4764,6 +4795,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
@@ -4941,6 +4973,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
@@ -5148,6 +5181,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
@@ -5325,6 +5359,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
@@ -5532,6 +5567,7 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
@@ -5709,6 +5745,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
@@ -5929,13 +5966,15 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
@@ -6110,13 +6149,15 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6292,13 +6333,15 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -6485,12 +6528,14 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
@@ -6654,12 +6699,14 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6830,12 +6877,14 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -7038,13 +7087,15 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
@@ -7327,13 +7378,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7625,13 +7678,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -7922,12 +7977,14 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
@@ -8200,12 +8257,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8487,12 +8546,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -8763,12 +8824,14 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -8978,13 +9041,15 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -9212,13 +9277,15 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9511,12 +9578,14 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9812,13 +9881,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -10155,13 +10226,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10508,13 +10581,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -10861,12 +10936,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11203,12 +11280,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11535,13 +11614,15 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -11818,12 +11899,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -12104,12 +12187,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
@@ -12437,13 +12522,15 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12791,12 +12878,14 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13107,6 +13196,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -13295,6 +13385,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13486,6 +13577,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -13691,6 +13783,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -13871,6 +13964,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14058,6 +14152,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -14260,6 +14355,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14453,6 +14549,7 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14642,6 +14739,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
@@ -14830,6 +14928,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
@@ -15010,6 +15109,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -15198,6 +15298,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -15382,6 +15483,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -15658,6 +15760,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15937,6 +16040,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -16230,6 +16334,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -16498,6 +16603,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16773,6 +16879,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -17063,6 +17170,7 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -17344,6 +17452,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -17621,6 +17730,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
@@ -17897,6 +18007,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
@@ -18165,6 +18276,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -18441,6 +18553,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index a3424793fdc4d..7d4a8b6480e30 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -25,6 +25,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -168,6 +169,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -317,6 +319,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -481,6 +484,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -622,6 +626,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -770,6 +775,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -945,13 +951,15 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1162,12 +1170,14 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1363,6 +1373,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
@@ -1506,6 +1517,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -1653,6 +1665,7 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -1796,6 +1809,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -1945,6 +1959,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
@@ -2109,6 +2124,7 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -2250,6 +2266,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2398,6 +2415,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
@@ -2573,13 +2591,15 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2790,12 +2810,14 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3006,13 +3028,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -3160,13 +3184,15 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3319,13 +3345,15 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -3492,12 +3520,14 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -3641,12 +3671,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3797,12 +3829,14 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -3967,13 +4001,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory:
@@ -4121,13 +4157,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -4293,13 +4331,15 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4596,13 +4636,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -4908,13 +4950,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5220,12 +5264,14 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory:
@@ -5514,12 +5560,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5817,12 +5865,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -6109,13 +6159,15 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -6345,12 +6397,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -6587,13 +6641,15 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6901,12 +6957,14 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7215,13 +7273,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -7559,13 +7619,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7913,13 +7975,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -8265,12 +8329,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory:
@@ -8598,12 +8664,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8941,12 +9009,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -9274,13 +9344,15 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -9558,12 +9630,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -9848,13 +9922,15 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10203,12 +10279,14 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10531,13 +10609,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -10766,13 +10846,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11004,13 +11086,15 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11260,12 +11344,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -11486,12 +11572,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11719,12 +11807,14 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11970,13 +12060,15 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12210,12 +12302,14 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12467,13 +12561,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -12812,13 +12908,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13160,13 +13258,15 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -13525,12 +13625,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -13859,12 +13961,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB58_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14200,12 +14304,14 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB59_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -14560,13 +14666,15 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB60_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14909,12 +15017,14 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB61_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 0d954e277cdd5..165c2c8f4165f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -25,6 +25,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -168,6 +169,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -317,6 +319,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -481,6 +484,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -622,6 +626,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -770,6 +775,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -945,13 +951,15 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1162,12 +1170,14 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -1363,6 +1373,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
@@ -1506,6 +1517,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -1653,6 +1665,7 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -1796,6 +1809,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -1945,6 +1959,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
@@ -2109,6 +2124,7 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -2250,6 +2266,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2398,6 +2415,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
@@ -2573,13 +2591,15 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2790,12 +2810,14 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3006,13 +3028,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -3160,13 +3184,15 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3319,13 +3345,15 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -3492,12 +3520,14 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -3641,12 +3671,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -3797,12 +3829,14 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -3967,13 +4001,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory:
@@ -4121,13 +4157,15 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -4293,13 +4331,15 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4596,13 +4636,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -4908,13 +4950,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5220,12 +5264,14 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory:
@@ -5514,12 +5560,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5817,12 +5865,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -6109,13 +6159,15 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -6345,12 +6397,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -6587,13 +6641,15 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -6901,12 +6957,14 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7215,13 +7273,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -7559,13 +7619,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -7913,13 +7975,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -8265,12 +8329,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory:
@@ -8598,12 +8664,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8941,12 +9009,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -9274,13 +9344,15 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -9558,12 +9630,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
@@ -9848,13 +9922,15 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10203,12 +10279,14 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -10531,13 +10609,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -10766,13 +10846,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11004,13 +11086,15 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11260,12 +11344,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -11486,12 +11572,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -11719,12 +11807,14 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -11970,13 +12060,15 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12210,12 +12302,14 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12467,13 +12561,15 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -12812,13 +12908,15 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13160,13 +13258,15 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -13525,12 +13625,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -13859,12 +13961,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB58_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14200,12 +14304,14 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB59_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -14560,13 +14666,15 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB60_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14909,12 +15017,14 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB61_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 47eb89eed9019..842f30a32f22a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -34,13 +34,15 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB0_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32:
@@ -229,13 +231,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos:
@@ -428,13 +432,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg:
@@ -644,12 +650,14 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB3_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32:
@@ -829,12 +837,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos:
@@ -1021,12 +1031,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg:
@@ -1232,13 +1244,15 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos:
@@ -1432,12 +1446,14 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos:
@@ -1631,13 +1647,15 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__ftz:
@@ -1826,13 +1844,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
@@ -2025,13 +2045,15 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
@@ -2241,12 +2263,14 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__ftz:
@@ -2426,12 +2450,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
@@ -2618,12 +2644,14 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
@@ -2829,13 +2857,15 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
@@ -3029,12 +3059,14 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
@@ -3228,13 +3260,15 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64:
@@ -3439,13 +3473,15 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_pos:
@@ -3651,13 +3687,15 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f64__offset12b_neg:
@@ -3877,12 +3915,14 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64:
@@ -4072,12 +4112,14 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_pos:
@@ -4274,12 +4316,14 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f64__offset12b_neg:
@@ -4513,13 +4557,15 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16:
@@ -4802,13 +4848,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos:
@@ -5100,13 +5148,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_neg:
@@ -5397,12 +5447,14 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16:
@@ -5675,12 +5727,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos:
@@ -5962,12 +6016,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg:
@@ -6239,13 +6295,15 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
@@ -6460,12 +6518,14 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
@@ -6687,13 +6747,15 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_f16__offset12b_pos:
@@ -6986,12 +7048,14 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos:
@@ -7287,13 +7351,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16:
@@ -7630,13 +7696,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos:
@@ -7983,13 +8051,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_neg:
@@ -8334,12 +8404,14 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16:
@@ -8666,12 +8738,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos:
@@ -9008,12 +9082,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg:
@@ -9340,13 +9416,15 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -9623,12 +9701,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr,
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
@@ -9912,13 +9992,15 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_bf16__offset12b_pos:
@@ -10266,12 +10348,14 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos:
@@ -10591,13 +10675,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16:
@@ -10809,13 +10895,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_pos:
@@ -11030,13 +11118,15 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2f16__offset12b_neg:
@@ -11268,12 +11358,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16:
@@ -11475,12 +11567,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_pos:
@@ -11689,12 +11783,14 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2f16__offset12b_neg:
@@ -11922,13 +12018,15 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_v2f16__offset12b_pos:
@@ -12144,12 +12242,14 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_v2f16__offset12b_pos:
@@ -12385,13 +12485,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16:
@@ -12730,13 +12832,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -13078,13 +13182,15 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
@@ -13443,12 +13549,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16:
@@ -13777,12 +13885,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
@@ -14118,12 +14228,14 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
@@ -14478,13 +14590,15 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -14827,12 +14941,14 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fsub_noret_v2bf16__offset12b_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 14d8b71c5167a..baee2c7c839a4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -265,10 +265,11 @@ define void @zero_init_foo() {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_clause 0x3
@@ -276,6 +277,7 @@ define void @zero_init_foo() {
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_foo:
@@ -354,10 +356,11 @@ define void @zero_init_foo() {
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: s_mov_b32 s0, 0
-; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_mov_b32 s1, s0
; GFX12-PAL-NEXT: s_mov_b32 s2, s0
; GFX12-PAL-NEXT: s_mov_b32 s3, s0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-PAL-NEXT: s_clause 0x3
@@ -365,6 +368,7 @@ define void @zero_init_foo() {
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca [32 x i16], align 2, addrspace(5)
call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
@@ -903,6 +907,7 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
@@ -975,6 +980,7 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [32 x float], align 4, addrspace(5)
@@ -1018,6 +1024,7 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
@@ -1059,6 +1066,7 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -1351,10 +1359,11 @@ define void @zero_init_small_offset_foo() {
; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_clause 0x3
@@ -1362,6 +1371,7 @@ define void @zero_init_small_offset_foo() {
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
@@ -1450,10 +1460,11 @@ define void @zero_init_small_offset_foo() {
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: s_mov_b32 s0, 0
-; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_mov_b32 s1, s0
; GFX12-PAL-NEXT: s_mov_b32 s2, s0
; GFX12-PAL-NEXT: s_mov_b32 s3, s0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-PAL-NEXT: s_clause 0x3
@@ -1461,6 +1472,7 @@ define void @zero_init_small_offset_foo() {
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [64 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
@@ -2158,6 +2170,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
@@ -2243,6 +2256,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
@@ -2562,10 +2576,11 @@ define void @zero_init_large_offset_foo() {
; GFX12-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 s1, s0
; GFX12-NEXT: s_mov_b32 s2, s0
; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_clause 0x3
@@ -2573,6 +2588,7 @@ define void @zero_init_large_offset_foo() {
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16400
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16416
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16432
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_large_offset_foo:
@@ -2702,10 +2718,11 @@ define void @zero_init_large_offset_foo() {
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
; GFX12-PAL-NEXT: s_mov_b32 s0, 0
-; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_mov_b32 s1, s0
; GFX12-PAL-NEXT: s_mov_b32 s2, s0
; GFX12-PAL-NEXT: s_mov_b32 s3, s0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-PAL-NEXT: s_clause 0x3
@@ -2713,6 +2730,7 @@ define void @zero_init_large_offset_foo() {
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16400
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16416
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16432
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [4096 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
@@ -3414,6 +3432,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
@@ -3501,6 +3520,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -3750,6 +3770,7 @@ define void @store_load_large_imm_offset_foo() {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
@@ -3826,6 +3847,7 @@ define void @store_load_large_imm_offset_foo() {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [4096 x i32], align 4, addrspace(5)
@@ -4039,6 +4061,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_aligned:
@@ -4097,6 +4120,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, ptr addrspace(5) %arg, align 8
@@ -4150,6 +4174,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
@@ -4208,6 +4233,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, ptr addrspace(5) %arg, align 1
@@ -4265,6 +4291,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
@@ -4328,6 +4355,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
@@ -4387,6 +4415,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
@@ -4453,6 +4482,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
@@ -4505,6 +4535,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
@@ -4573,6 +4604,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1
@@ -4627,6 +4659,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
@@ -4696,6 +4729,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
+; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index 3b7009023b03a..04cd150d93176 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -174,7 +174,8 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_ss(<3 x half> inreg %a, <3 x ha
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: v_pk_maximum_f16 v0, s0, s2
; GFX12-GISEL-NEXT: s_maximum_f16 s0, s1, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 9ce1ba3316dd5..745180f242afa 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -11,6 +11,7 @@ define float @v_fmaximum3_f32(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32:
@@ -38,6 +39,7 @@ define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v2, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_commute:
@@ -93,6 +95,7 @@ define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fabs0:
@@ -121,6 +124,7 @@ define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fabs1:
@@ -149,6 +153,7 @@ define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fabs2:
@@ -177,6 +182,7 @@ define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fabs_all:
@@ -207,6 +213,7 @@ define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fneg_all:
@@ -237,6 +244,7 @@ define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fneg_fabs_all:
@@ -270,6 +278,7 @@ define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fneg0:
@@ -298,6 +307,7 @@ define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fneg1:
@@ -326,6 +336,7 @@ define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, -v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fneg2:
@@ -354,6 +365,7 @@ define float @v_fmaximum3_f32_const0(float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, 0x41000000, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_const0:
@@ -381,6 +393,7 @@ define float @v_fmaximum3_f32__const2(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 0x41000000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32__const2:
@@ -408,6 +421,7 @@ define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, 4.0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_inlineimm0:
@@ -435,6 +449,7 @@ define float @v_fmaximum3_f32__inlineimm(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32__inlineimm:
@@ -462,8 +477,9 @@ define float @v_fmaximum3_f32_const1_const2(float %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0x41000000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_maximum3_f32 v0, v0, s0, 0x41800000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_const1_const2:
@@ -492,6 +508,7 @@ define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v4, v0, v2
; GFX12-NEXT: v_maximum3_f32 v1, v5, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32:
@@ -526,6 +543,7 @@ define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, v4
; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32_commute:
@@ -560,6 +578,7 @@ define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v2|, |v4|
; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v3|, |v5|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32__fabs_all:
@@ -597,6 +616,7 @@ define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v2, -v4
; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v3, -v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32__fneg_all:
@@ -634,6 +654,7 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v2
; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm1:
@@ -668,6 +689,7 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, 4.0
; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm2:
@@ -703,6 +725,7 @@ define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float
; GFX12-NEXT: v_maximum3_f32 v0, v6, v0, v3
; GFX12-NEXT: v_maximum3_f32 v1, v7, v1, v4
; GFX12-NEXT: v_maximum3_f32 v2, v8, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32:
@@ -744,6 +767,7 @@ define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3
; GFX12-NEXT: v_maximum3_f32 v0, v0, v3, v6
; GFX12-NEXT: v_maximum3_f32 v1, v1, v4, v7
; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32_commute:
@@ -785,6 +809,7 @@ define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b,
; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v3|, |v6|
; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v4|, |v7|
; GFX12-NEXT: v_maximum3_f32 v2, |v2|, |v5|, |v8|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32__fabs_all:
@@ -829,6 +854,7 @@ define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b,
; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v3, -v6
; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v4, -v7
; GFX12-NEXT: v_maximum3_f32 v2, -v2, -v5, -v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32__fneg_all:
@@ -873,6 +899,7 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c
; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v3
; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v4
; GFX12-NEXT: v_maximum3_f32 v2, v2, 2.0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm1:
@@ -914,6 +941,7 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
; GFX12-NEXT: v_maximum3_f32 v0, v0, v3, 4.0
; GFX12-NEXT: v_maximum3_f32 v1, v1, v4, 4.0
; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm2:
@@ -954,6 +982,7 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16:
@@ -981,6 +1010,7 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v2, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_commute:
@@ -1040,6 +1070,7 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, |v0|, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fabs0:
@@ -1068,6 +1099,7 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, |v1|, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fabs1:
@@ -1096,6 +1128,7 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, |v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fabs2:
@@ -1124,6 +1157,7 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fabs_all:
@@ -1154,6 +1188,7 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fneg_all:
@@ -1184,6 +1219,7 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all:
@@ -1217,6 +1253,7 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, -v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fneg0:
@@ -1245,6 +1282,7 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, -v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fneg1:
@@ -1273,6 +1311,7 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, -v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fneg2:
@@ -1301,6 +1340,7 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, 0x4800, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_const0:
@@ -1328,6 +1368,7 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 0x4800
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16__const2:
@@ -1355,6 +1396,7 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, 4.0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_inlineimm0:
@@ -1382,6 +1424,7 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16__inlineimm:
@@ -1409,8 +1452,9 @@ define half @v_fmaximum3_f16_const1_const2(half %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_movk_i32 s0, 0x4800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_const1_const2:
@@ -1440,6 +1484,7 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v2, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16:
@@ -1478,6 +1523,7 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16_commute:
@@ -1519,6 +1565,7 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16__fabs_all:
@@ -1563,6 +1610,7 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16__fneg_all:
@@ -1604,6 +1652,7 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm1:
@@ -1642,6 +1691,7 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm2:
@@ -1682,6 +1732,7 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v4, v0
; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16:
@@ -1732,6 +1783,7 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16_commute:
@@ -1789,6 +1841,7 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16__fabs_all:
@@ -1848,6 +1901,7 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16__fneg_all:
@@ -1901,6 +1955,7 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm1:
@@ -1949,6 +2004,7 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm2:
@@ -1999,6 +2055,7 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v4, v0
; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16:
@@ -2053,6 +2110,7 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16_commute:
@@ -2114,6 +2172,7 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16__fabs_all:
@@ -2177,6 +2236,7 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16__fneg_all:
@@ -2234,6 +2294,7 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16__inlineimm1:
@@ -2288,6 +2349,7 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16__inlineimm2:
@@ -2340,6 +2402,7 @@ define double @v_fmaximum3_f64(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64:
@@ -2371,6 +2434,7 @@ define double @v_fmaximum3_f64_commute(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[4:5], v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_commute:
@@ -2441,6 +2505,7 @@ define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], |v[0:1]|, v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fabs0:
@@ -2473,6 +2538,7 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fabs1:
@@ -2505,6 +2571,7 @@ define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fabs2:
@@ -2537,6 +2604,7 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], |v[0:1]|, |v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fabs_all:
@@ -2571,6 +2639,7 @@ define double @v_fmaximum3_f64_fneg_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], -v[0:1], -v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fneg_all:
@@ -2605,6 +2674,7 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], -|v[0:1]|, -|v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -|v[4:5]|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fneg_fabs_all:
@@ -2642,6 +2712,7 @@ define double @v_fmaximum3_f64_fneg0(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], -v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fneg0:
@@ -2674,6 +2745,7 @@ define double @v_fmaximum3_f64_fneg1(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fneg1:
@@ -2706,6 +2778,7 @@ define double @v_fmaximum3_f64_fneg2(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fneg2:
@@ -2738,6 +2811,7 @@ define double @v_fmaximum3_f64_const0(double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_const0:
@@ -2771,6 +2845,7 @@ define double @v_fmaximum3_f64__const2(double %a, double %b) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64__const2:
@@ -2804,6 +2879,7 @@ define double @v_fmaximum3_f64_inlineimm0(double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], 4.0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_inlineimm0:
@@ -2835,6 +2911,7 @@ define double @v_fmaximum3_f64__inlineimm(double %a, double %b) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64__inlineimm:
@@ -2866,6 +2943,7 @@ define double @v_fmaximum3_f64_const1_const2(double %a) {
; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40300000, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_const1_const2:
@@ -2901,6 +2979,7 @@ define <2 x float> @v_no_fmaximum3_f32__multi_use(float %a, float %b, float %c)
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f32 v1, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fmaximum3_f32__multi_use:
@@ -2965,6 +3044,7 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f16 v1, v0, v2
; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fmaximum3_f16__multi_use:
@@ -3035,6 +3115,7 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b,
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v1, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fmaximum3_v2f16__multi_use:
@@ -3074,6 +3155,7 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[2:3], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fmaximum3_f64__multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 817e6dd87361f..3271758f71297 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -174,7 +174,8 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_ss(<3 x half> inreg %a, <3 x ha
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: v_pk_minimum_f16 v0, s0, s2
; GFX12-GISEL-NEXT: s_minimum_f16 s0, s1, s3
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 21074d58bdb7e..de63b99e9139c 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -11,6 +11,7 @@ define float @v_fminimum3_f32(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32:
@@ -38,6 +39,7 @@ define float @v_fminimum3_f32_commute(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v2, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_commute:
@@ -93,6 +95,7 @@ define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fabs0:
@@ -121,6 +124,7 @@ define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fabs1:
@@ -149,6 +153,7 @@ define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fabs2:
@@ -177,6 +182,7 @@ define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fabs_all:
@@ -207,6 +213,7 @@ define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fneg_all:
@@ -237,6 +244,7 @@ define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fneg_fabs_all:
@@ -270,6 +278,7 @@ define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fneg0:
@@ -298,6 +307,7 @@ define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fneg1:
@@ -326,6 +336,7 @@ define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, -v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fneg2:
@@ -354,6 +365,7 @@ define float @v_fminimum3_f32_const0(float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, 0x41000000, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_const0:
@@ -381,6 +393,7 @@ define float @v_fminimum3_f32__const2(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 0x41000000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32__const2:
@@ -408,6 +421,7 @@ define float @v_fminimum3_f32_inlineimm0(float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, 4.0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_inlineimm0:
@@ -435,6 +449,7 @@ define float @v_fminimum3_f32__inlineimm(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32__inlineimm:
@@ -462,8 +477,9 @@ define float @v_fminimum3_f32_const1_const2(float %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s0, 0x41000000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_minimum3_f32 v0, v0, s0, 0x41800000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_const1_const2:
@@ -492,6 +508,7 @@ define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v4, v0, v2
; GFX12-NEXT: v_minimum3_f32 v1, v5, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32:
@@ -526,6 +543,7 @@ define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v2, v4
; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32_commute:
@@ -560,6 +578,7 @@ define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v2|, |v4|
; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v3|, |v5|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32__fabs_all:
@@ -597,6 +616,7 @@ define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v2, -v4
; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v3, -v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32__fneg_all:
@@ -634,6 +654,7 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, 2.0, v2
; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32__inlineimm1:
@@ -668,6 +689,7 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v2, 4.0
; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32__inlineimm2:
@@ -703,6 +725,7 @@ define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float
; GFX12-NEXT: v_minimum3_f32 v0, v6, v0, v3
; GFX12-NEXT: v_minimum3_f32 v1, v7, v1, v4
; GFX12-NEXT: v_minimum3_f32 v2, v8, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32:
@@ -744,6 +767,7 @@ define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3
; GFX12-NEXT: v_minimum3_f32 v0, v0, v3, v6
; GFX12-NEXT: v_minimum3_f32 v1, v1, v4, v7
; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32_commute:
@@ -785,6 +809,7 @@ define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b,
; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v3|, |v6|
; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v4|, |v7|
; GFX12-NEXT: v_minimum3_f32 v2, |v2|, |v5|, |v8|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32__fabs_all:
@@ -829,6 +854,7 @@ define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b,
; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v3, -v6
; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v4, -v7
; GFX12-NEXT: v_minimum3_f32 v2, -v2, -v5, -v8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32__fneg_all:
@@ -873,6 +899,7 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c
; GFX12-NEXT: v_minimum3_f32 v0, v0, 2.0, v3
; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v4
; GFX12-NEXT: v_minimum3_f32 v2, v2, 2.0, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32__inlineimm1:
@@ -914,6 +941,7 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
; GFX12-NEXT: v_minimum3_f32 v0, v0, v3, 4.0
; GFX12-NEXT: v_minimum3_f32 v1, v1, v4, 4.0
; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32__inlineimm2:
@@ -954,6 +982,7 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16:
@@ -981,6 +1010,7 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v2, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_commute:
@@ -1040,6 +1070,7 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, |v0|, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fabs0:
@@ -1068,6 +1099,7 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, |v1|, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fabs1:
@@ -1096,6 +1128,7 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, |v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fabs2:
@@ -1124,6 +1157,7 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fabs_all:
@@ -1154,6 +1188,7 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fneg_all:
@@ -1184,6 +1219,7 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fneg_fabs_all:
@@ -1217,6 +1253,7 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, -v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fneg0:
@@ -1245,6 +1282,7 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, -v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fneg1:
@@ -1273,6 +1311,7 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, -v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fneg2:
@@ -1301,6 +1340,7 @@ define half @v_fminimum3_f16_const0(half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, 0x4800, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_const0:
@@ -1328,6 +1368,7 @@ define half @v_fminimum3_f16__const2(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 0x4800
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16__const2:
@@ -1355,6 +1396,7 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, 4.0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_inlineimm0:
@@ -1382,6 +1424,7 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16__inlineimm:
@@ -1409,8 +1452,9 @@ define half @v_fminimum3_f16_const1_const2(half %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_movk_i32 s0, 0x4800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_minimum3_f16 v0, v0, s0, 0x4c00
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_const1_const2:
@@ -1440,6 +1484,7 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v2, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16:
@@ -1478,6 +1523,7 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16_commute:
@@ -1519,6 +1565,7 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16__fabs_all:
@@ -1563,6 +1610,7 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16__fneg_all:
@@ -1604,6 +1652,7 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16__inlineimm1:
@@ -1642,6 +1691,7 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16__inlineimm2:
@@ -1682,6 +1732,7 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v4, v0
; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16:
@@ -1732,6 +1783,7 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16_commute:
@@ -1789,6 +1841,7 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16__fabs_all:
@@ -1848,6 +1901,7 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16__fneg_all:
@@ -1901,6 +1955,7 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16__inlineimm1:
@@ -1949,6 +2004,7 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16__inlineimm2:
@@ -1999,6 +2055,7 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v4, v0
; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16:
@@ -2053,6 +2110,7 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16_commute:
@@ -2114,6 +2172,7 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16__fabs_all:
@@ -2177,6 +2236,7 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16__fneg_all:
@@ -2234,6 +2294,7 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16__inlineimm1:
@@ -2288,6 +2349,7 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16__inlineimm2:
@@ -2340,6 +2402,7 @@ define double @v_fminimum3_f64(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64:
@@ -2371,6 +2434,7 @@ define double @v_fminimum3_f64_commute(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[4:5], v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_commute:
@@ -2441,6 +2505,7 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], |v[0:1]|, v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fabs0:
@@ -2473,6 +2538,7 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fabs1:
@@ -2505,6 +2571,7 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fabs2:
@@ -2537,6 +2604,7 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], |v[0:1]|, |v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fabs_all:
@@ -2571,6 +2639,7 @@ define double @v_fminimum3_f64_fneg_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], -v[0:1], -v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fneg_all:
@@ -2605,6 +2674,7 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], -|v[0:1]|, -|v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -|v[4:5]|
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fneg_fabs_all:
@@ -2642,6 +2712,7 @@ define double @v_fminimum3_f64_fneg0(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], -v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fneg0:
@@ -2674,6 +2745,7 @@ define double @v_fminimum3_f64_fneg1(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fneg1:
@@ -2706,6 +2778,7 @@ define double @v_fminimum3_f64_fneg2(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fneg2:
@@ -2738,6 +2811,7 @@ define double @v_fminimum3_f64_const0(double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_const0:
@@ -2771,6 +2845,7 @@ define double @v_fminimum3_f64__const2(double %a, double %b) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64__const2:
@@ -2804,6 +2879,7 @@ define double @v_fminimum3_f64_inlineimm0(double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], 4.0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_inlineimm0:
@@ -2835,6 +2911,7 @@ define double @v_fminimum3_f64__inlineimm(double %a, double %b) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64__inlineimm:
@@ -2866,6 +2943,7 @@ define double @v_fminimum3_f64_const1_const2(double %a) {
; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40300000, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_const1_const2:
@@ -2901,6 +2979,7 @@ define <2 x float> @v_no_fminimum3_f32__multi_use(float %a, float %b, float %c)
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f32 v1, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fminimum3_f32__multi_use:
@@ -2965,6 +3044,7 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f16 v1, v0, v2
; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fminimum3_f16__multi_use:
@@ -3035,6 +3115,7 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b,
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v1, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fminimum3_v2f16__multi_use:
@@ -3074,6 +3155,7 @@ define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[2:3], v[0:1], v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fminimum3_f64__multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index 1914b74be1909..d5159adcd4f02 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -78,6 +78,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn:
@@ -92,6 +93,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret <2 x half> %ret
@@ -110,6 +112,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn:
@@ -124,6 +127,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -161,6 +165,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_rtn:
@@ -172,6 +177,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
ret <2 x half> %ret
@@ -209,6 +215,7 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn:
@@ -220,6 +227,7 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -259,6 +267,7 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn:
@@ -270,6 +279,7 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -284,6 +294,7 @@ define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16:
@@ -294,6 +305,7 @@ define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
@@ -310,6 +322,7 @@ define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x ha
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16_rtn:
@@ -321,6 +334,7 @@ define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x ha
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 95d8ca391b843..f2d97fbf6ba9c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -64,8 +64,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -104,8 +105,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -131,6 +133,7 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
ret float %ret
@@ -168,13 +171,15 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret float %ret
@@ -220,6 +225,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
ret <2 x half> %ret
@@ -265,6 +271,7 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -311,6 +318,7 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -361,6 +369,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret <2 x half> %ret
@@ -411,6 +420,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 5c4ded9a231e0..dfc03df40534a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -26,6 +26,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32:
@@ -197,6 +198,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
@@ -370,6 +372,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
@@ -553,6 +556,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32:
@@ -706,6 +710,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
@@ -862,6 +867,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
@@ -1035,13 +1041,15 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos:
@@ -1255,12 +1263,14 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos:
@@ -1463,6 +1473,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
@@ -1634,6 +1645,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
@@ -1807,6 +1819,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
@@ -1990,6 +2003,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
@@ -2143,6 +2157,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
@@ -2299,6 +2314,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
@@ -2472,13 +2488,15 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
@@ -2692,12 +2710,14 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
@@ -2909,13 +2929,15 @@ define double @global_agent_atomic_fadd_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f64:
@@ -3130,13 +3152,15 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos:
@@ -3352,13 +3376,15 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg:
@@ -3581,12 +3607,14 @@ define void @global_agent_atomic_fadd_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f64:
@@ -3784,12 +3812,14 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos:
@@ -3990,12 +4020,14 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg:
@@ -4223,13 +4255,15 @@ define half @global_agent_atomic_fadd_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f16:
@@ -4562,13 +4596,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos:
@@ -4912,13 +4948,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg:
@@ -5261,12 +5299,14 @@ define void @global_agent_atomic_fadd_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f16:
@@ -5588,12 +5628,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos:
@@ -5925,12 +5967,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg:
@@ -6252,13 +6296,15 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
@@ -6512,12 +6558,14 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrs
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
@@ -6776,13 +6824,15 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos:
@@ -7127,12 +7177,14 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos:
@@ -7478,13 +7530,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16:
@@ -7871,13 +7925,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos:
@@ -8276,13 +8332,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg:
@@ -8679,12 +8737,14 @@ define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16:
@@ -9060,12 +9120,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos:
@@ -9452,12 +9514,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg:
@@ -9834,13 +9898,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
@@ -10156,12 +10222,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos(ptr addr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
@@ -10482,13 +10550,15 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos:
@@ -10888,12 +10958,14 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos:
@@ -11254,6 +11326,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16:
@@ -11485,6 +11558,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspa
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -11718,6 +11792,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspa
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
@@ -11955,6 +12030,7 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16:
@@ -12164,6 +12240,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -12376,6 +12453,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
@@ -12596,6 +12674,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -12831,6 +12910,7 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -13062,6 +13142,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16:
@@ -13391,6 +13472,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -13722,6 +13804,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
@@ -14057,6 +14140,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16:
@@ -14376,6 +14460,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
@@ -14698,6 +14783,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
@@ -15028,6 +15114,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -15361,6 +15448,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
@@ -15681,6 +15769,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -15689,6 +15778,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0
; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index 4f7b6164936f8..da1494974cb57 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -26,6 +26,7 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32:
@@ -188,6 +189,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -352,6 +354,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
@@ -516,6 +519,7 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32:
@@ -672,6 +676,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -831,6 +836,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1001,13 +1007,15 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos:
@@ -1254,12 +1262,14 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos:
@@ -1492,6 +1502,7 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
@@ -1654,6 +1665,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -1818,6 +1830,7 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
@@ -1982,6 +1995,7 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
@@ -2138,6 +2152,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -2297,6 +2312,7 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
@@ -2467,13 +2483,15 @@ define float @global_system_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -2720,12 +2738,14 @@ define void @global_system_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -2969,13 +2989,15 @@ define double @global_agent_atomic_fmax_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64:
@@ -3141,13 +3163,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_pos:
@@ -3314,13 +3338,15 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__offset12b_neg:
@@ -3487,12 +3513,14 @@ define void @global_agent_atomic_fmax_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64:
@@ -3649,12 +3677,14 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_pos:
@@ -3814,12 +3844,14 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__offset12b_neg:
@@ -3997,13 +4029,15 @@ define half @global_agent_atomic_fmax_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f16:
@@ -4350,13 +4384,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos:
@@ -4714,13 +4750,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg:
@@ -5078,12 +5116,14 @@ define void @global_agent_atomic_fmax_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f16:
@@ -5421,12 +5461,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos:
@@ -5774,12 +5816,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg:
@@ -6116,13 +6160,15 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4:
@@ -6391,12 +6437,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos(ptr addrs
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos:
@@ -6670,13 +6718,15 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos:
@@ -7036,12 +7086,14 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos:
@@ -7400,13 +7452,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16:
@@ -7795,13 +7849,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos:
@@ -8202,13 +8258,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg:
@@ -8607,12 +8665,14 @@ define void @global_agent_atomic_fmax_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16:
@@ -8990,12 +9050,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos:
@@ -9384,12 +9446,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg:
@@ -9768,13 +9832,15 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4:
@@ -10092,12 +10158,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos(ptr addr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos:
@@ -10420,13 +10488,15 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos:
@@ -10828,12 +10898,14 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos:
@@ -11207,13 +11279,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16:
@@ -11499,13 +11573,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_pos:
@@ -11793,13 +11869,15 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2f16__offset12b_neg:
@@ -12091,12 +12169,14 @@ define void @global_agent_atomic_fmax_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16:
@@ -12372,12 +12452,14 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_pos:
@@ -12656,12 +12738,14 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2f16__offset12b_neg:
@@ -12948,13 +13032,15 @@ define <2 x half> @global_system_atomic_fmax_ret_v2f16__offset12b_pos(ptr addrsp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_v2f16__offset12b_pos:
@@ -13244,12 +13330,14 @@ define void @global_system_atomic_fmax_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_v2f16__offset12b_pos:
@@ -13552,13 +13640,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16:
@@ -13950,13 +14040,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_pos:
@@ -14350,13 +14442,15 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_v2bf16__offset12b_neg:
@@ -14753,12 +14847,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16:
@@ -15138,12 +15234,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_pos:
@@ -15526,12 +15624,14 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_v2bf16__offset12b_neg:
@@ -15923,13 +16023,15 @@ define <2 x bfloat> @global_system_atomic_fmax_ret_v2bf16__offset12b_pos(ptr add
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_ret_v2bf16__offset12b_pos:
@@ -16324,12 +16426,14 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmax_noret_v2bf16__offset12b_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index 591e01b11bd24..7609f51de5fbb 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -26,6 +26,7 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32:
@@ -188,6 +189,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -352,6 +354,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -516,6 +519,7 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32:
@@ -672,6 +676,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -831,6 +836,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1001,13 +1007,15 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos:
@@ -1254,12 +1262,14 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos:
@@ -1492,6 +1502,7 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
@@ -1654,6 +1665,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -1818,6 +1830,7 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -1982,6 +1995,7 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
@@ -2138,6 +2152,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2297,6 +2312,7 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
@@ -2467,13 +2483,15 @@ define float @global_system_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -2720,12 +2738,14 @@ define void @global_system_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2969,13 +2989,15 @@ define double @global_agent_atomic_fmin_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64:
@@ -3141,13 +3163,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_pos:
@@ -3314,13 +3338,15 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f64__offset12b_neg:
@@ -3487,12 +3513,14 @@ define void @global_agent_atomic_fmin_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64:
@@ -3649,12 +3677,14 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_pos:
@@ -3814,12 +3844,14 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f64__offset12b_neg:
@@ -3997,13 +4029,15 @@ define half @global_agent_atomic_fmin_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f16:
@@ -4350,13 +4384,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos:
@@ -4714,13 +4750,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg:
@@ -5078,12 +5116,14 @@ define void @global_agent_atomic_fmin_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f16:
@@ -5421,12 +5461,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos:
@@ -5774,12 +5816,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg:
@@ -6116,13 +6160,15 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4:
@@ -6391,12 +6437,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos(ptr addrs
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos:
@@ -6670,13 +6718,15 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos:
@@ -7036,12 +7086,14 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos:
@@ -7400,13 +7452,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16:
@@ -7795,13 +7849,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos:
@@ -8202,13 +8258,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg:
@@ -8607,12 +8665,14 @@ define void @global_agent_atomic_fmin_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16:
@@ -8990,12 +9050,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos:
@@ -9384,12 +9446,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg:
@@ -9768,13 +9832,15 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4:
@@ -10092,12 +10158,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos(ptr addr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos:
@@ -10420,13 +10488,15 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos:
@@ -10828,12 +10898,14 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos:
@@ -11207,13 +11279,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16:
@@ -11499,13 +11573,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_pos:
@@ -11793,13 +11869,15 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2f16__offset12b_neg:
@@ -12091,12 +12169,14 @@ define void @global_agent_atomic_fmin_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16:
@@ -12372,12 +12452,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_pos:
@@ -12656,12 +12738,14 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2f16__offset12b_neg:
@@ -12948,13 +13032,15 @@ define <2 x half> @global_system_atomic_fmin_ret_v2f16__offset12b_pos(ptr addrsp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_v2f16__offset12b_pos:
@@ -13244,12 +13330,14 @@ define void @global_system_atomic_fmin_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_v2f16__offset12b_pos:
@@ -13552,13 +13640,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16:
@@ -13950,13 +14040,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_pos:
@@ -14350,13 +14442,15 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_v2bf16__offset12b_neg:
@@ -14753,12 +14847,14 @@ define void @global_agent_atomic_fmin_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16:
@@ -15138,12 +15234,14 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_pos:
@@ -15526,12 +15624,14 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_v2bf16__offset12b_neg:
@@ -15923,13 +16023,15 @@ define <2 x bfloat> @global_system_atomic_fmin_ret_v2bf16__offset12b_pos(ptr add
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_ret_v2bf16__offset12b_pos:
@@ -16324,12 +16426,14 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fmin_noret_v2bf16__offset12b_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 8e58f309dd9ae..2332118fc30e3 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -35,13 +35,15 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB0_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32:
@@ -266,13 +268,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos:
@@ -499,13 +503,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg:
@@ -741,12 +747,14 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB3_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32:
@@ -961,12 +969,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos:
@@ -1184,12 +1194,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg:
@@ -1416,13 +1428,15 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos:
@@ -1650,12 +1664,14 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos:
@@ -1880,13 +1896,15 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__ftz:
@@ -2111,13 +2129,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz:
@@ -2344,13 +2364,15 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz:
@@ -2586,12 +2608,14 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__ftz:
@@ -2806,12 +2830,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz:
@@ -3029,12 +3055,14 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz:
@@ -3261,13 +3289,15 @@ define float @global_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_f32__offset12b_pos__ftz:
@@ -3495,12 +3525,14 @@ define void @global_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_f32__offset12b_pos__ftz:
@@ -3725,13 +3757,15 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f64:
@@ -3976,13 +4010,15 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_pos:
@@ -4228,13 +4264,15 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f64__offset12b_neg:
@@ -4487,12 +4525,14 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f64:
@@ -4716,12 +4756,14 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_pos:
@@ -4948,12 +4990,14 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f64__offset12b_neg:
@@ -5207,13 +5251,15 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f16:
@@ -5546,13 +5592,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
@@ -5896,13 +5944,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
@@ -6245,12 +6295,14 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f16:
@@ -6572,12 +6624,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos:
@@ -6909,12 +6963,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg:
@@ -7236,13 +7292,15 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
@@ -7496,12 +7554,14 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos:
@@ -7760,13 +7820,15 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
@@ -8111,12 +8173,14 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB31_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos:
@@ -8462,13 +8526,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB32_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16:
@@ -8855,13 +8921,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB33_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
@@ -9260,13 +9328,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
@@ -9663,12 +9733,14 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB35_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16:
@@ -10044,12 +10116,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB36_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos:
@@ -10436,12 +10510,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB37_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg:
@@ -10818,13 +10894,15 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -11140,12 +11218,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos:
@@ -11466,13 +11546,15 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
@@ -11872,12 +11954,14 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB41_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos:
@@ -12247,13 +12331,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB42_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16:
@@ -12522,13 +12608,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB43_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_pos:
@@ -12799,13 +12887,15 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB44_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2f16__offset12b_neg:
@@ -13079,12 +13169,14 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16:
@@ -13341,12 +13433,14 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_pos:
@@ -13606,12 +13700,14 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2f16__offset12b_neg:
@@ -13880,13 +13976,15 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_v2f16__offset12b_pos:
@@ -14158,12 +14256,14 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_v2f16__offset12b_pos:
@@ -14450,13 +14550,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16:
@@ -14848,13 +14950,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -15248,13 +15352,15 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_ret_v2bf16__offset12b_neg:
@@ -15651,12 +15757,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16:
@@ -16036,12 +16144,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_pos:
@@ -16424,12 +16534,14 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fsub_noret_v2bf16__offset12b_neg:
@@ -16821,13 +16933,15 @@ define <2 x bfloat> @global_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr add
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB56_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_ret_v2bf16__offset12b_pos:
@@ -17222,12 +17336,14 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB57_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fsub_noret_v2bf16__offset12b_pos:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 40f0acf3d5d09..0ff3d77d85303 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -280,7 +280,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -571,7 +571,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -864,7 +864,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -1155,7 +1155,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -1448,7 +1448,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -1739,7 +1739,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -2014,7 +2014,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -2287,7 +2287,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -2562,7 +2562,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -2835,7 +2835,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -3110,7 +3110,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -3383,7 +3383,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -3658,7 +3658,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -3931,7 +3931,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
@@ -4224,7 +4224,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -4515,7 +4515,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -4920,7 +4920,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5211,7 +5211,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5504,7 +5504,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -5795,7 +5795,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6102,7 +6102,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 scope:SCOPE_DEV
@@ -6196,7 +6196,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -6427,7 +6427,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] scope:SCOPE_DEV
@@ -6518,7 +6518,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
diff --git a/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
new file mode 100644
index 0000000000000..4aa49f2c9296d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=gfx1201 %s -o - | FileCheck %s
+
+define amdgpu_kernel void @foo() {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b64 s[0:1], src_shared_base
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
+; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0
+; CHECK-NEXT: flat_store_b64 v[0:1], v[2:3]
+; CHECK-NEXT: s_endpgm
+entry:
+ br label %bb1
+
+bb0:
+ br label %bb1
+
+bb1:
+ %dst = phi ptr [ null, %bb0 ], [ addrspacecast (ptr addrspace(3) null to ptr), %entry ]
+ store i64 0, ptr %dst, align 16
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index eb4cba35e9946..c63d6e99a1040 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -44,15 +44,19 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-LABEL: indirect_call_known_no_special_inputs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_getpc_b64 s[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s7, s7
-; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+8
-; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+16
+; GFX12-NEXT: s_add_co_u32 s6, s6, snork@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s7, s7, snork@gotpcrel32@hi+24
; GFX12-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX12-NEXT: s_mov_b64 s[4:5], 0
; GFX12-NEXT: s_getpc_b64 s[8:9]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s9, s9
-; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+8
-; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+16
+; GFX12-NEXT: s_add_co_u32 s8, s8, wobble@gotpcrel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s9, s9, wobble@gotpcrel32@hi+24
; GFX12-NEXT: s_load_u8 s12, s[4:5], 0x0
; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX12-NEXT: s_load_b64 s[6:7], s[8:9], 0x0
@@ -61,12 +65,13 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
; GFX12-NEXT: s_mov_b32 s32, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s8, 1, s12
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_eq_u32 s8, 1
; GFX12-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX12-NEXT: s_cselect_b32 s7, s7, s5
; GFX12-NEXT: s_cselect_b32 s6, s6, s4
; GFX12-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX12-NEXT: s_endpgm
@@ -90,6 +95,7 @@ define void @wobble() {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
ret void
@@ -108,6 +114,7 @@ define void @snork() {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index b9dc27cb7e019..8b56c6040614b 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -128,7 +128,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -137,12 +136,14 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB0_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
ret void
@@ -297,13 +298,15 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
ret i32 %result
@@ -416,15 +419,18 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) {
; GFX12-NEXT: scratch_load_b32 v32, off, s32
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_getpc_b64 s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s1, s1
-; GFX12-NEXT: s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+8
-; GFX12-NEXT: s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+16
+; GFX12-NEXT: s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+12
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+24
; GFX12-NEXT: scratch_store_b32 off, v32, s32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[32:33], off, s32 offset:24
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: scratch_store_b64 off, v[32:33], s32 offset:16
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[0:1]
entry:
%alloca = alloca double, align 8, addrspace(5)
@@ -599,28 +605,37 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cvt_f32_u32 s4, s3
; GFX12-NEXT: s_sub_co_i32 s5, 0, s3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cvt_u32_f32 s4, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_i32 s5, s5, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_hi_u32 s5, s4, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_i32 s4, s4, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_hi_u32 s4, s2, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s5, s4, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sub_co_i32 s2, s2, s5
; GFX12-NEXT: s_add_co_i32 s5, s4, 1
; GFX12-NEXT: s_sub_co_i32 s6, s2, s3
; GFX12-NEXT: s_cmp_ge_u32 s2, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cselect_b32 s4, s5, s4
; GFX12-NEXT: s_cselect_b32 s2, s6, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_co_i32 s5, s4, 1
; GFX12-NEXT: s_cmp_ge_u32 s2, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cselect_b32 s2, s5, s4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
@@ -797,6 +812,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -805,8 +821,9 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s0, s0, 5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_u32 v0, v1
@@ -882,6 +899,7 @@ define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) {
; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst
ret void
@@ -1043,6 +1061,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: s_mov_b32 s0, exec_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12-NEXT: ; implicit-def: $vgpr1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1052,8 +1071,9 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s1, s1, 5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2
@@ -1227,6 +1247,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: s_mov_b32 s0, exec_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12-NEXT: ; implicit-def: $vgpr1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1236,8 +1257,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s1, s1, 5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, s1
; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 9f093cc7b5abf..203af74183ab7 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -196,6 +196,7 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v0, v2, v[0:1]
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[1:2]
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i32:
@@ -216,6 +217,7 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
; GFX1200-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add i32 %x, 1
@@ -389,6 +391,7 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i16:
@@ -410,6 +413,7 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%conv33 = add i16 %x, 1
@@ -605,6 +609,7 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i16> %x, <i16 1, i16 1>
@@ -927,6 +932,7 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v3i16:
@@ -954,6 +960,7 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y48 = add <3 x i16> %x, <i16 1, i16 1, i16 1>
@@ -1362,6 +1369,7 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v4i16:
@@ -1389,6 +1397,7 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <4 x i16> %x, <i16 1, i16 1, i16 1, i16 1>
@@ -1556,6 +1565,7 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_umad_pat_i16:
@@ -1577,6 +1587,7 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%conv33 = add i16 %x, 1
@@ -1772,6 +1783,7 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i16> %x, <i16 1, i16 1>
@@ -2094,6 +2106,7 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_umad_pat_v3i16:
@@ -2121,6 +2134,7 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y48 = add <3 x i16> %x, <i16 1, i16 1, i16 1>
@@ -2529,6 +2543,7 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_umad_pat_v4i16:
@@ -2556,6 +2571,7 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <4 x i16> %x, <i16 1, i16 1, i16 1, i16 1>
@@ -2841,6 +2857,7 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v0, v[3:4]
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v4, v2, v[4:5]
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i32:
@@ -2871,6 +2888,7 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v2, v0
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v1, v3, v1
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i32> %x, <i32 1, i32 1>
@@ -3259,6 +3277,7 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v2, v[5:6]
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[2:3], null, v6, v3, v[6:7]
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v3i32:
@@ -3300,6 +3319,7 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v1, v4, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v2, v5, v2
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y48 = add <3 x i32> %x, <i32 1, i32 1, i32 1>
@@ -3749,6 +3769,7 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v3, v[7:8]
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[3:4], null, v8, v4, v[8:9]
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v4i32:
@@ -3798,6 +3819,7 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v2, v5, v2
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v3, v6, v3
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
@@ -4008,6 +4030,7 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v0, v2, v[0:1]
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[1:2]
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i24:
@@ -4030,6 +4053,7 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v1, v0, v1
; GFX1200-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%shl = shl i32 %x, 8
@@ -4244,6 +4268,7 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v0, v2, v[0:1]
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[1:2]
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_u24:
@@ -4266,6 +4291,7 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v1, v0, v1
; GFX1200-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%shl = and i32 %x, 16777215
@@ -4441,6 +4467,7 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i8:
@@ -4462,6 +4489,7 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%conv33 = add i8 %x, 1
@@ -4754,6 +4782,7 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX1200-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i8:
@@ -4785,6 +4814,7 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v4
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v5
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i8> %x, <i8 1, i8 1>
@@ -5244,6 +5274,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v4, v1, v3
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v3, v[0:1]
; GFX1200-SDAG-NEXT: v_add3_u32 v1, v4, v1, v2
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i64:
@@ -5288,6 +5319,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v6, v[2:3]
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v7, v[1:2]
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add i64 %x, 1
@@ -6114,6 +6146,7 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_add3_u32 v1, v9, v1, v4
; GFX1200-SDAG-NEXT: v_add3_u32 v3, v10, v3, v6
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i64:
@@ -6188,6 +6221,7 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v8, v10
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v2, v14, v15
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v15, v[4:5]
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i64> %x, <i64 1, i64 1>
@@ -6445,6 +6479,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
; GFX1200-NEXT: global_store_b32 v[3:4], v5, off scope:SCOPE_SYS
; GFX1200-NEXT: s_wait_storecnt 0x0
; GFX1200-NEXT: v_add_nc_u32_e32 v0, v5, v0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
bb:
%i = add i32 %arg, 1
@@ -6678,6 +6713,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
; GFX1200-NEXT: global_store_b32 v[3:4], v5, off scope:SCOPE_SYS
; GFX1200-NEXT: s_wait_storecnt 0x0
; GFX1200-NEXT: v_add_nc_u32_e32 v0, v5, v1
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
bb:
%i = add i32 %arg, 1
@@ -6976,6 +7012,7 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v0, v2, v[0:1]
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[1:2]
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i32_x2:
@@ -7007,6 +7044,7 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
; GFX1200-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y38 = add i32 %x, 1
@@ -7526,6 +7564,7 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v0, v[3:4]
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v4, v2, v[4:5]
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i32_x2:
@@ -7578,6 +7617,7 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v2, v0
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v1, v3, v1
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y38 = add <2 x i32> %x, <i32 1, i32 1>
@@ -7849,6 +7889,7 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i16_x2:
@@ -7880,6 +7921,7 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%conv69 = add i16 %x, 1
@@ -8145,6 +8187,7 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_umad_pat_i16_x2:
@@ -8176,6 +8219,7 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%conv69 = add i16 %x, 1
@@ -8499,6 +8543,7 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%y38 = add <2 x i16> %x, <i16 1, i16 1>
@@ -8822,6 +8867,7 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v0, v1
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%y38 = add <2 x i16> %x, <i16 1, i16 1>
@@ -8897,6 +8943,7 @@ define <2 x i32> @multi_use_mul_mad_i32_var(i32 %x, i32 %y, i32 %z0, i32 %z1) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v0, v1, v2
; GFX1200-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul i32 %x, %y
@@ -9012,6 +9059,7 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) {
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v3
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: multi_use_mul_mad_i16_var:
@@ -9028,6 +9076,7 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) {
; GFX1200-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul i16 %x, %y
@@ -9099,6 +9148,7 @@ define i32 @other_use_mul_mad_i32_var(i32 %x, i32 %y, i32 %z, ptr addrspace(3) %
; GFX1200-NEXT: v_add_nc_u32_e32 v0, v1, v2
; GFX1200-NEXT: ds_store_b32 v3, v1
; GFX1200-NEXT: s_wait_dscnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul i32 %x, %y
@@ -9199,6 +9249,7 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) %
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX1200-SDAG-NEXT: ds_store_b16 v3, v4
; GFX1200-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: other_use_mul_mad_i16_var:
@@ -9213,6 +9264,7 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) %
; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v1, v2
; GFX1200-GISEL-NEXT: ds_store_b16 v3, v1
; GFX1200-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul i16 %x, %y
@@ -9330,6 +9382,7 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
; GFX1200-NEXT: v_pk_mad_u16 v1, v0, v1, v3
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1200-NEXT: v_mov_b32_e32 v0, v2
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul <2 x i16> %x, %y
@@ -9457,6 +9510,7 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
; GFX1200-NEXT: v_pk_mad_u16 v0, v0, v1, v2
; GFX1200-NEXT: ds_store_b32 v3, v4
; GFX1200-NEXT: s_wait_dscnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul <2 x i16> %x, %y
@@ -9540,6 +9594,7 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
+; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: mul_u24_add64:
@@ -9554,6 +9609,7 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
%mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %x, i32 %y)
%add = add i64 %mul, %z
@@ -9613,6 +9669,7 @@ define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1200-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y)
%mul.zext = zext i32 %mul to i64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
index 9445f1225e0cb..67c890c279432 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
@@ -12,6 +12,7 @@ define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
@@ -30,6 +31,7 @@ define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inr
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
@@ -46,6 +48,7 @@ define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
@@ -64,6 +67,7 @@ define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc,
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
@@ -83,6 +87,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
@@ -100,6 +105,7 @@ define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inre
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
@@ -117,6 +123,7 @@ define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 in
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -135,6 +142,7 @@ define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -151,6 +159,7 @@ define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -169,6 +178,7 @@ define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsr
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
@@ -188,6 +198,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %r
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
@@ -205,6 +216,7 @@ define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> i
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-NEXT: s_mov_b32 s4, 4
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
index de484e3db18ab..050cbb544e5ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
@@ -12,6 +12,7 @@ define float @raw_buffer_load(<4 x i32> inreg) {
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: buffer_load_b32 v0, off, s[0:3], null th:TH_LOAD_LU
; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
main_body:
%data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %0, i32 0, i32 0, i32 3)
@@ -29,6 +30,7 @@ define float @struct_buffer_load(<4 x i32> inreg) {
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_load_b32 v0, v0, s[0:3], null idxen th:TH_LOAD_LU
; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
main_body:
%data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index d3fc96d7ff801..9eb747ebe7149 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -28,6 +28,7 @@ define float @test_cvt_f32_bf8_byte0(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e32 v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
ret float %ret
@@ -48,6 +49,7 @@ define float @test_cvt_f32_bf8_byte1(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
ret float %ret
@@ -68,6 +70,7 @@ define float @test_cvt_f32_bf8_byte2(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
ret float %ret
@@ -88,6 +91,7 @@ define float @test_cvt_f32_bf8_byte3(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3)
ret float %ret
@@ -108,6 +112,7 @@ define float @test_cvt_f32_fp8_byte0(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e32 v0, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0)
ret float %ret
@@ -128,6 +133,7 @@ define float @test_cvt_f32_fp8_byte1(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
ret float %ret
@@ -148,6 +154,7 @@ define float @test_cvt_f32_fp8_byte2(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
ret float %ret
@@ -168,6 +175,7 @@ define float @test_cvt_f32_fp8_byte3(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3)
ret float %ret
@@ -188,6 +196,7 @@ define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
ret <2 x float> %ret
@@ -208,6 +217,7 @@ define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true)
ret <2 x float> %ret
@@ -228,6 +238,7 @@ define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false)
ret <2 x float> %ret
@@ -248,6 +259,7 @@ define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
ret <2 x float> %ret
@@ -271,6 +283,7 @@ define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) {
; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false)
ret i32 %ret
@@ -295,6 +308,7 @@ define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) {
; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true)
ret i32 %ret
@@ -318,6 +332,7 @@ define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false)
ret i32 %ret
@@ -342,6 +357,7 @@ define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true)
ret i32 %ret
@@ -365,6 +381,7 @@ define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0)
ret i32 %ret
@@ -388,6 +405,7 @@ define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1)
ret i32 %ret
@@ -412,6 +430,7 @@ define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2)
ret i32 %ret
@@ -436,6 +455,7 @@ define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3)
ret i32 %ret
@@ -459,6 +479,7 @@ define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0)
ret i32 %ret
@@ -482,6 +503,7 @@ define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1)
ret i32 %ret
@@ -506,6 +528,7 @@ define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2)
ret i32 %ret
@@ -530,6 +553,7 @@ define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3)
ret i32 %ret
@@ -553,6 +577,7 @@ define float @test_sext_cvt_f32_fp8(i16 %a) {
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a.sext, i32 1)
@@ -577,6 +602,7 @@ define float @test_sext_cvt_f32_bf8(i16 %a) {
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a.sext, i32 1)
@@ -601,6 +627,7 @@ define <2 x float> @test_sext_cvt_pk_f32_bf8_word1(i16 %a) {
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a.sext, i1 true)
@@ -625,6 +652,7 @@ define <2 x float> @test_sext_cvt_pk_f32_fp8_word0(i16 %a) {
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a.sext, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
index 8ea10f4496a2e..f78b0a9c4ad2c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
@@ -11,6 +11,7 @@ define float @test_amdgcn_dot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
@@ -26,6 +27,7 @@ define float @test_amdgcn_dot4_f32_fp8_bf8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -42,6 +44,7 @@ define float @test_amdgcn_dot4_f32_fp8_bf8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -58,6 +61,7 @@ define float @test_amdgcn_dot4_f32_fp8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -75,6 +79,7 @@ define float @test_amdgcn_dot4_f32_fp8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -92,6 +97,7 @@ define float @test_amdgcn_dot4_f32_bf8_fp8(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
@@ -107,6 +113,7 @@ define float @test_amdgcn_dot4_f32_bf8_fp8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -123,6 +130,7 @@ define float @test_amdgcn_dot4_f32_bf8_fp8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -139,6 +147,7 @@ define float @test_amdgcn_dot4_f32_bf8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -156,6 +165,7 @@ define float @test_amdgcn_dot4_f32_bf8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -173,6 +183,7 @@ define float @test_amdgcn_dot4_f32_fp8_fp8(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
@@ -188,6 +199,7 @@ define float @test_amdgcn_dot4_f32_fp8_fp8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -204,6 +216,7 @@ define float @test_amdgcn_dot4_f32_fp8_fp8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -220,6 +233,7 @@ define float @test_amdgcn_dot4_f32_fp8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -237,6 +251,7 @@ define float @test_amdgcn_dot4_f32_fp8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -254,6 +269,7 @@ define float @test_amdgcn_dot4_f32_bf8_bf8(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
@@ -269,6 +285,7 @@ define float @test_amdgcn_dot4_f32_bf8_bf8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -285,6 +302,7 @@ define float @test_amdgcn_dot4_f32_bf8_bf8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -301,6 +319,7 @@ define float @test_amdgcn_dot4_f32_bf8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -318,6 +337,7 @@ define float @test_amdgcn_dot4_f32_bf8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 8e9a652ae8a8e..ce382942315bb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -587,7 +587,8 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
@@ -664,9 +665,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
@@ -680,9 +681,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 %
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
@@ -728,7 +729,8 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
@@ -805,9 +807,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
@@ -821,9 +823,9 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1
; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
@@ -3320,7 +3322,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
@@ -3366,7 +3369,8 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
@@ -3443,9 +3447,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
@@ -3459,9 +3463,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
@@ -3538,9 +3542,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub
; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
@@ -3554,9 +3558,9 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_movk_i32 s2, 0x1234
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1
; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
@@ -9213,6 +9217,7 @@ define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call half @llvm.amdgcn.permlane16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store half %v, ptr addrspace(1) %out
@@ -9251,6 +9256,7 @@ define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call half @llvm.amdgcn.permlanex16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store half %v, ptr addrspace(1) %out
@@ -9289,6 +9295,7 @@ define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call bfloat @llvm.amdgcn.permlane16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store bfloat %v, ptr addrspace(1) %out
@@ -9327,6 +9334,7 @@ define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call bfloat @llvm.amdgcn.permlanex16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store bfloat %v, ptr addrspace(1) %out
@@ -9365,6 +9373,7 @@ define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call i16 @llvm.amdgcn.permlane16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store i16 %v, ptr addrspace(1) %out
@@ -9403,6 +9412,7 @@ define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call i16 @llvm.amdgcn.permlanex16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store i16 %v, ptr addrspace(1) %out
@@ -9441,6 +9451,7 @@ define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %sr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x half> @llvm.amdgcn.permlane16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <2 x half> %v, ptr addrspace(1) %out
@@ -9479,6 +9490,7 @@ define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %s
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x half> @llvm.amdgcn.permlanex16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <2 x half> %v, ptr addrspace(1) %out
@@ -9541,6 +9553,7 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlane16_v2f32:
@@ -9556,6 +9569,7 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x float> @llvm.amdgcn.permlane16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <2 x float> %v, ptr addrspace(1) %out
@@ -9618,6 +9632,7 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlanex16_v2f32:
@@ -9633,6 +9648,7 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %
; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x float> @llvm.amdgcn.permlanex16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <2 x float> %v, ptr addrspace(1) %out
@@ -9728,6 +9744,7 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlane16_v7i32:
@@ -9750,6 +9767,7 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <7 x i32> @llvm.amdgcn.permlane16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <7 x i32> %v, ptr addrspace(1) %out
@@ -9845,6 +9863,7 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlanex16_v7i32:
@@ -9867,6 +9886,7 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <7 x i32> @llvm.amdgcn.permlanex16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <7 x i32> %v, ptr addrspace(1) %out
@@ -9939,6 +9959,7 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlane16_v8i16:
@@ -9956,6 +9977,7 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src
; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <8 x i16> @llvm.amdgcn.permlane16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <8 x i16> %v, ptr addrspace(1) %out
@@ -10028,6 +10050,7 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlanex16_v8i16:
@@ -10045,6 +10068,7 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <8 x i16> @llvm.amdgcn.permlanex16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <8 x i16> %v, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
index bb42834221681..dca743939e706 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
@@ -38,6 +38,7 @@ define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %s
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr @llvm.amdgcn.permlane16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr %v, ptr addrspace(1) %out
@@ -79,6 +80,7 @@ define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr @llvm.amdgcn.permlanex16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr %v, ptr addrspace(1) %out
@@ -137,6 +139,7 @@ define void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr> @llvm.amdgcn.permlane16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr> %v, ptr addrspace(1) %out
@@ -195,6 +198,7 @@ define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr> @llvm.amdgcn.permlanex16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr> %v, ptr addrspace(1) %out
@@ -233,6 +237,7 @@ define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(3) @llvm.amdgcn.permlane16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(3) %v, ptr addrspace(1) %out
@@ -271,6 +276,7 @@ define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(3) @llvm.amdgcn.permlanex16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(3) %v, ptr addrspace(1) %out
@@ -315,6 +321,7 @@ define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %sr
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
@@ -359,6 +366,7 @@ define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %s
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlanex16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
@@ -397,6 +405,7 @@ define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(5) @llvm.amdgcn.permlane16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(5) %v, ptr addrspace(1) %out
@@ -435,6 +444,7 @@ define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(5) @llvm.amdgcn.permlanex16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(5) %v, ptr addrspace(1) %out
@@ -479,6 +489,7 @@ define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %sr
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
@@ -523,6 +534,7 @@ define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %s
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlanex16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
@@ -561,6 +573,7 @@ define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(6) @llvm.amdgcn.permlane16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(6) %v, ptr addrspace(1) %out
@@ -599,6 +612,7 @@ define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(6) @llvm.amdgcn.permlanex16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(6) %v, ptr addrspace(1) %out
@@ -643,6 +657,7 @@ define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %sr
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
@@ -687,6 +702,7 @@ define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %s
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlanex16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
index 320b0b4508b6a..a827548f6abeb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -12,6 +12,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 128
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -28,6 +29,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2)
ret <2 x bfloat> %ret
@@ -42,6 +44,7 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 128
%unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -57,6 +60,7 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voff
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2)
ret void
@@ -81,23 +85,27 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v5, s[4:7], s3 offen offset:128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX12-NEXT: ; implicit-def: $vgpr6
; GFX12-NEXT: ; implicit-def: $vgpr5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 128
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
index ce46e2755ae58..b137d3462e156 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
@@ -42,6 +42,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24)
ret void
@@ -85,6 +86,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret void
@@ -128,6 +130,7 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -171,6 +174,7 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0)
ret void
@@ -214,6 +218,7 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
index 327d80a7b67cd..46c816fb4c51a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
@@ -31,6 +31,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24)
ret float %ret
@@ -64,6 +65,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret float %ret
@@ -97,6 +99,7 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
@@ -130,6 +133,7 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0)
ret <2 x half> %ret
@@ -163,6 +167,7 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 8bfe996c6a90a..8fdf604d95238 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -210,6 +210,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal m0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_var:
@@ -222,6 +223,7 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal m0
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg)
ret void
@@ -489,6 +491,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst m0
; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
; GCN-NEXT: global_load_b32 v0, v[0:1], off
; GCN-NEXT: global_load_b32 v1, v[2:3], off
@@ -496,6 +499,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
; GCN-NEXT: v_mul_lo_u32 v0, v1, v0
; GCN-NEXT: global_store_b32 v[7:8], v0, off
; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst_var:
@@ -516,8 +520,9 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
-; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_and_b32 s0, 1, s0
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GLOBAL-ISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
; GLOBAL-ISEL-NEXT: global_load_b32 v0, v[0:1], off
@@ -526,6 +531,7 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v0, v1, v0
; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v0, off
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
@@ -741,6 +747,7 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_barrier_init m0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test5_s_barrier_init_m0:
@@ -752,11 +759,12 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s0, v1
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s1, v0
-; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GLOBAL-ISEL-NEXT: s_lshl_b32 s0, 16, s0
-; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_or_b32 m0, s1, s0
; GLOBAL-ISEL-NEXT: s_barrier_init m0
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.barrier.init(i32 %arg1, i32 %arg2)
ret void
@@ -945,6 +953,7 @@ define void @test5_s_barrier_join_m0(i32 %arg) {
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_barrier_join m0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test5_s_barrier_join_m0:
@@ -956,6 +965,7 @@ define void @test5_s_barrier_join_m0(i32 %arg) {
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
; GLOBAL-ISEL-NEXT: s_barrier_join m0
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.barrier.join(i32 %arg)
ret void
@@ -1202,6 +1212,7 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) {
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_wakeup_barrier m0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test5_s_wakeup_barrier_m0:
@@ -1213,6 +1224,7 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) {
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.wakeup.barrier(i32 %arg)
ret void
@@ -1386,11 +1398,13 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
; GCN-NEXT: s_wait_bvhcnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_get_barrier_state s0, m0
; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test5_s_get_barrier_state_m0:
@@ -1403,8 +1417,10 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v0, s0
+; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
%state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg)
ret i32 %state
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
index bc7052132a87b..70dff2c800a4b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
@@ -15,6 +15,7 @@ define void @test_s_sleep_var1(i32 %arg) {
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_sleep_var s0
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.sleep.var(i32 %arg)
ret void
@@ -29,6 +30,7 @@ define void @test_s_sleep_var2() {
; GCN-NEXT: s_wait_bvhcnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_sleep_var 10
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.sleep.var(i32 10)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
index 78204dfefc80c..d46f21d28556f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -12,6 +12,7 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsr
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x bfloat> %ret
@@ -26,6 +27,7 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgp
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%unused = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -50,23 +52,27 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB2_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x bfloat> %ret
@@ -90,22 +96,26 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr0
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB3_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
index 1005996003044..2d03e3e122a70 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
@@ -46,6 +46,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -90,6 +91,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
@@ -137,6 +139,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
@@ -184,6 +187,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -306,22 +310,26 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr0
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB4_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -444,22 +452,26 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr0
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB5_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
index 5f6a67e466020..7f9712a283ecb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
@@ -35,6 +35,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
@@ -69,6 +70,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
@@ -106,6 +108,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
@@ -143,6 +146,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
@@ -235,23 +239,27 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB4_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
@@ -344,23 +352,27 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB5_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
index bd803c380e90a..07d43ba5f2e7a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
@@ -56,6 +56,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
@@ -111,6 +112,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -167,6 +169,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
@@ -222,6 +225,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
@@ -274,6 +278,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -326,6 +331,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -380,6 +386,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
@@ -432,6 +439,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
@@ -554,18 +562,22 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -708,23 +720,27 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX12-NEXT: ; implicit-def: $vgpr7
; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
index c9b50eddc94ee..0bcdc29d642c2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
@@ -56,6 +56,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
@@ -111,6 +112,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -167,6 +169,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
@@ -222,6 +225,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
@@ -274,6 +278,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -326,6 +331,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -380,6 +386,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
@@ -432,6 +439,7 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
@@ -554,18 +562,22 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s1, s1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s0 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -708,23 +720,27 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: s_and_b32 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[5:6], s[4:7], s3 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX12-NEXT: ; implicit-def: $vgpr7
; GFX12-NEXT: ; implicit-def: $vgpr5_vgpr6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
index df5533b629502..7cca57e36c7f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
@@ -49,9 +49,10 @@ define amdgpu_gfx void @test_wave_id_callable(ptr addrspace(1) %out) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s0, ttmp8, 0x50019
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%waveid = call i32 @llvm.amdgcn.wave.id()
store i32 %waveid, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 5d3a5800bcdd8..e7e0eb8ed370a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -74,6 +74,7 @@ define half @v_maximum_f16(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call half @llvm.maximum.f16(half %src0, half %src1)
ret half %op
@@ -128,6 +129,7 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan half @llvm.maximum.f16(half %src0, half %src1)
ret half %op
@@ -200,6 +202,7 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz half @llvm.maximum.f16(half %src0, half %src1)
ret half %op
@@ -254,6 +257,7 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz half @llvm.maximum.f16(half %src0, half %src1)
ret half %op
@@ -334,6 +338,7 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) {
; GFX12-NEXT: v_add_f16_e32 v0, 1.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan half %arg0, 1.0
%op = call half @llvm.maximum.f16(half %src0, half %src1)
@@ -415,6 +420,7 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) {
; GFX12-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan half %arg1, 1.0
%op = call half @llvm.maximum.f16(half %src0, half %src1)
@@ -515,11 +521,13 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_maximum_f16 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call half @llvm.maximum.f16(half %src0, half %src1)
%cast = bitcast half %op to i16
@@ -631,6 +639,7 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -692,6 +701,7 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -800,6 +810,7 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -861,6 +872,7 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -1009,6 +1021,7 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
%cast = bitcast <2 x half> %op to i32
@@ -1143,6 +1156,7 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1215,6 +1229,7 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1347,6 +1362,7 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1419,6 +1435,7 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1584,6 +1601,7 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1663,6 +1681,7 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1828,6 +1847,7 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1907,6 +1927,7 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -2186,6 +2207,7 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) {
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
; GFX12-NEXT: v_pk_maximum_f16 v2, v2, v6
; GFX12-NEXT: v_pk_maximum_f16 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x half> @llvm.maximum.v8f16(<8 x half> %src0, <8 x half> %src1)
ret <8 x half> %op
@@ -2701,6 +2723,7 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
; GFX12-NEXT: v_pk_maximum_f16 v5, v5, v13
; GFX12-NEXT: v_pk_maximum_f16 v6, v6, v14
; GFX12-NEXT: v_pk_maximum_f16 v7, v7, v15
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x half> @llvm.maximum.v16f16(<16 x half> %src0, <16 x half> %src1)
ret <16 x half> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index e6655aeab7e9b..1a9be3bddb160 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -70,6 +70,7 @@ define float @v_maximum_f32(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call float @llvm.maximum.f32(float %src0, float %src1)
ret float %op
@@ -120,6 +121,7 @@ define float @v_maximum_f32__nnan(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan float @llvm.maximum.f32(float %src0, float %src1)
ret float %op
@@ -188,6 +190,7 @@ define float @v_maximum_f32__nsz(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz float @llvm.maximum.f32(float %src0, float %src1)
ret float %op
@@ -238,6 +241,7 @@ define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz float @llvm.maximum.f32(float %src0, float %src1)
ret float %op
@@ -314,6 +318,7 @@ define float @v_maximum_f32__nnan_src0(float %arg0, float %src1) {
; GFX12-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan float %arg0, 1.0
%op = call float @llvm.maximum.f32(float %src0, float %src1)
@@ -391,6 +396,7 @@ define float @v_maximum_f32__nnan_src1(float %src0, float %arg1) {
; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan float %arg1, 1.0
%op = call float @llvm.maximum.f32(float %src0, float %src1)
@@ -485,6 +491,7 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call float @llvm.maximum.f32(float %src0, float %src1)
call void asm sideeffect "; use $0", "s"(float %op)
@@ -573,6 +580,7 @@ define <2 x float> @v_maximum_v2f32(<2 x float> %src0, <2 x float> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v2
; GFX12-NEXT: v_maximum_f32 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -629,6 +637,7 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v2
; GFX12-NEXT: v_maximum_f32 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -716,6 +725,7 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v2
; GFX12-NEXT: v_maximum_f32 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -772,6 +782,7 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v2
; GFX12-NEXT: v_maximum_f32 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -888,6 +899,7 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:1]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
call void asm sideeffect "; use $0", "s"(<2 x float> %op)
@@ -996,6 +1008,7 @@ define <3 x float> @v_maximum_v3f32(<3 x float> %src0, <3 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v0, v0, v3
; GFX12-NEXT: v_maximum_f32 v1, v1, v4
; GFX12-NEXT: v_maximum_f32 v2, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1059,6 +1072,7 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
; GFX12-NEXT: v_maximum_f32 v0, v0, v3
; GFX12-NEXT: v_maximum_f32 v1, v1, v4
; GFX12-NEXT: v_maximum_f32 v2, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1166,6 +1180,7 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v0, v0, v3
; GFX12-NEXT: v_maximum_f32 v1, v1, v4
; GFX12-NEXT: v_maximum_f32 v2, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1229,6 +1244,7 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
; GFX12-NEXT: v_maximum_f32 v0, v0, v3
; GFX12-NEXT: v_maximum_f32 v1, v1, v4
; GFX12-NEXT: v_maximum_f32 v2, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1356,6 +1372,7 @@ define <4 x float> @v_maximum_v4f32(<4 x float> %src0, <4 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v1, v1, v5
; GFX12-NEXT: v_maximum_f32 v2, v2, v6
; GFX12-NEXT: v_maximum_f32 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1425,6 +1442,7 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
; GFX12-NEXT: v_maximum_f32 v1, v1, v5
; GFX12-NEXT: v_maximum_f32 v2, v2, v6
; GFX12-NEXT: v_maximum_f32 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1552,6 +1570,7 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v1, v1, v5
; GFX12-NEXT: v_maximum_f32 v2, v2, v6
; GFX12-NEXT: v_maximum_f32 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1621,6 +1640,7 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
; GFX12-NEXT: v_maximum_f32 v1, v1, v5
; GFX12-NEXT: v_maximum_f32 v2, v2, v6
; GFX12-NEXT: v_maximum_f32 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1826,6 +1846,7 @@ define <8 x float> @v_maximum_v8f32(<8 x float> %src0, <8 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v5, v5, v13
; GFX12-NEXT: v_maximum_f32 v6, v6, v14
; GFX12-NEXT: v_maximum_f32 v7, v7, v15
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x float> @llvm.maximum.v8f32(<8 x float> %src0, <8 x float> %src1)
ret <8 x float> %op
@@ -2227,6 +2248,7 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v14, v14, v30
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_maximum_f32 v15, v15, v31
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x float> @llvm.maximum.v16f32(<16 x float> %src0, <16 x float> %src1)
ret <16 x float> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index 9a83c04cad1e3..d0122891f96b1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -76,6 +76,7 @@ define double @v_maximum_f64(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.maximum.f64(double %src0, double %src1)
ret double %op
@@ -126,6 +127,7 @@ define double @v_maximum_f64__nnan(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan double @llvm.maximum.f64(double %src0, double %src1)
ret double %op
@@ -200,6 +202,7 @@ define double @v_maximum_f64__nsz(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz double @llvm.maximum.f64(double %src0, double %src1)
ret double %op
@@ -250,6 +253,7 @@ define double @v_maximum_f64__nnan_nsz(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz double @llvm.maximum.f64(double %src0, double %src1)
ret double %op
@@ -333,6 +337,7 @@ define double @v_maximum_f64__nnan_src0(double %arg0, double %src1) {
; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan double %arg0, 1.0
%op = call double @llvm.maximum.f64(double %src0, double %src1)
@@ -417,6 +422,7 @@ define double @v_maximum_f64__nnan_src1(double %src0, double %arg1) {
; GFX12-NEXT: v_add_f64_e32 v[2:3], 1.0, v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan double %arg1, 1.0
%op = call double @llvm.maximum.f64(double %src0, double %src1)
@@ -520,6 +526,7 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v[0:1]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.maximum.f64(double %src0, double %src1)
call void asm sideeffect "; use $0", "s"(double %op)
@@ -621,6 +628,7 @@ define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -678,6 +686,7 @@ define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -778,6 +787,7 @@ define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -835,6 +845,7 @@ define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -969,6 +980,7 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v[0:3]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
call void asm sideeffect "; use $0", "s"(<2 x double> %op)
@@ -1095,6 +1107,7 @@ define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1159,6 +1172,7 @@ define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1284,6 +1298,7 @@ define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1348,6 +1363,7 @@ define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1499,6 +1515,7 @@ define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) {
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1570,6 +1587,7 @@ define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1721,6 +1739,7 @@ define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1792,6 +1811,7 @@ define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -2058,6 +2078,7 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) {
; GFX12-NEXT: v_maximum_f64 v[12:13], v[12:13], v[28:29]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[14:15], v[14:15], v[30:31]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x double> @llvm.maximum.v8f64(<8 x double> %src0, <8 x double> %src1)
ret <8 x double> %op
@@ -2934,6 +2955,7 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX12-NEXT: v_maximum_f64 v[28:29], v[28:29], v[84:85]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[30:31], v[30:31], v[86:87]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x double> @llvm.maximum.v16f64(<16 x double> %src0, <16 x double> %src1)
ret <16 x double> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 01effc24e741d..c237c0d1de2c9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -61,6 +61,7 @@ define half @v_minimum_f16(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call half @llvm.minimum.f16(half %src0, half %src1)
ret half %op
@@ -105,6 +106,7 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan half @llvm.minimum.f16(half %src0, half %src1)
ret half %op
@@ -164,6 +166,7 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz half @llvm.minimum.f16(half %src0, half %src1)
ret half %op
@@ -208,6 +211,7 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz half @llvm.minimum.f16(half %src0, half %src1)
ret half %op
@@ -274,6 +278,7 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) {
; GFX12-NEXT: v_add_f16_e32 v0, 1.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan half %arg0, 1.0
%op = call half @llvm.minimum.f16(half %src0, half %src1)
@@ -341,6 +346,7 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) {
; GFX12-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan half %arg1, 1.0
%op = call half @llvm.minimum.f16(half %src0, half %src1)
@@ -424,11 +430,13 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_minimum_f16 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call half @llvm.minimum.f16(half %src0, half %src1)
%cast = bitcast half %op to i16
@@ -520,6 +528,7 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -566,6 +575,7 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -654,6 +664,7 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -700,6 +711,7 @@ define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -821,6 +833,7 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
%cast = bitcast <2 x half> %op to i32
@@ -928,6 +941,7 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -980,6 +994,7 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1085,6 +1100,7 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1137,6 +1153,7 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1268,6 +1285,7 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1322,6 +1340,7 @@ define <4 x half> @v_minimum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1453,6 +1472,7 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1507,6 +1527,7 @@ define <4 x half> @v_minimum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1724,6 +1745,7 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) {
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
; GFX12-NEXT: v_pk_minimum_f16 v2, v2, v6
; GFX12-NEXT: v_pk_minimum_f16 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x half> @llvm.minimum.v8f16(<8 x half> %src0, <8 x half> %src1)
ret <8 x half> %op
@@ -2119,6 +2141,7 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
; GFX12-NEXT: v_pk_minimum_f16 v5, v5, v13
; GFX12-NEXT: v_pk_minimum_f16 v6, v6, v14
; GFX12-NEXT: v_pk_minimum_f16 v7, v7, v15
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x half> @llvm.minimum.v16f16(<16 x half> %src0, <16 x half> %src1)
ret <16 x half> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 518fc27c23082..d981cc44903de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -70,6 +70,7 @@ define float @v_minimum_f32(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call float @llvm.minimum.f32(float %src0, float %src1)
ret float %op
@@ -120,6 +121,7 @@ define float @v_minimum_f32__nnan(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan float @llvm.minimum.f32(float %src0, float %src1)
ret float %op
@@ -188,6 +190,7 @@ define float @v_minimum_f32__nsz(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz float @llvm.minimum.f32(float %src0, float %src1)
ret float %op
@@ -238,6 +241,7 @@ define float @v_minimum_f32__nnan_nsz(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz float @llvm.minimum.f32(float %src0, float %src1)
ret float %op
@@ -314,6 +318,7 @@ define float @v_minimum_f32__nnan_src0(float %arg0, float %src1) {
; GFX12-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan float %arg0, 1.0
%op = call float @llvm.minimum.f32(float %src0, float %src1)
@@ -391,6 +396,7 @@ define float @v_minimum_f32__nnan_src1(float %src0, float %arg1) {
; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan float %arg1, 1.0
%op = call float @llvm.minimum.f32(float %src0, float %src1)
@@ -485,6 +491,7 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call float @llvm.minimum.f32(float %src0, float %src1)
call void asm sideeffect "; use $0", "s"(float %op)
@@ -573,6 +580,7 @@ define <2 x float> @v_minimum_v2f32(<2 x float> %src0, <2 x float> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v2
; GFX12-NEXT: v_minimum_f32 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -629,6 +637,7 @@ define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v2
; GFX12-NEXT: v_minimum_f32 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -716,6 +725,7 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v2
; GFX12-NEXT: v_minimum_f32 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -772,6 +782,7 @@ define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v2
; GFX12-NEXT: v_minimum_f32 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -888,6 +899,7 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s[0:1]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
call void asm sideeffect "; use $0", "s"(<2 x float> %op)
@@ -996,6 +1008,7 @@ define <3 x float> @v_minimum_v3f32(<3 x float> %src0, <3 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v0, v0, v3
; GFX12-NEXT: v_minimum_f32 v1, v1, v4
; GFX12-NEXT: v_minimum_f32 v2, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1059,6 +1072,7 @@ define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
; GFX12-NEXT: v_minimum_f32 v0, v0, v3
; GFX12-NEXT: v_minimum_f32 v1, v1, v4
; GFX12-NEXT: v_minimum_f32 v2, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1166,6 +1180,7 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v0, v0, v3
; GFX12-NEXT: v_minimum_f32 v1, v1, v4
; GFX12-NEXT: v_minimum_f32 v2, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1229,6 +1244,7 @@ define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
; GFX12-NEXT: v_minimum_f32 v0, v0, v3
; GFX12-NEXT: v_minimum_f32 v1, v1, v4
; GFX12-NEXT: v_minimum_f32 v2, v2, v5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1356,6 +1372,7 @@ define <4 x float> @v_minimum_v4f32(<4 x float> %src0, <4 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v1, v1, v5
; GFX12-NEXT: v_minimum_f32 v2, v2, v6
; GFX12-NEXT: v_minimum_f32 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1425,6 +1442,7 @@ define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
; GFX12-NEXT: v_minimum_f32 v1, v1, v5
; GFX12-NEXT: v_minimum_f32 v2, v2, v6
; GFX12-NEXT: v_minimum_f32 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1552,6 +1570,7 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v1, v1, v5
; GFX12-NEXT: v_minimum_f32 v2, v2, v6
; GFX12-NEXT: v_minimum_f32 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1621,6 +1640,7 @@ define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
; GFX12-NEXT: v_minimum_f32 v1, v1, v5
; GFX12-NEXT: v_minimum_f32 v2, v2, v6
; GFX12-NEXT: v_minimum_f32 v3, v3, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1826,6 +1846,7 @@ define <8 x float> @v_minimum_v8f32(<8 x float> %src0, <8 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v5, v5, v13
; GFX12-NEXT: v_minimum_f32 v6, v6, v14
; GFX12-NEXT: v_minimum_f32 v7, v7, v15
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x float> @llvm.minimum.v8f32(<8 x float> %src0, <8 x float> %src1)
ret <8 x float> %op
@@ -2227,6 +2248,7 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v14, v14, v30
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_minimum_f32 v15, v15, v31
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x float> @llvm.minimum.v16f32(<16 x float> %src0, <16 x float> %src1)
ret <16 x float> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index 81b892d424b46..7cf68fdddf356 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -76,6 +76,7 @@ define double @v_minimum_f64(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.minimum.f64(double %src0, double %src1)
ret double %op
@@ -126,6 +127,7 @@ define double @v_minimum_f64__nnan(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan double @llvm.minimum.f64(double %src0, double %src1)
ret double %op
@@ -200,6 +202,7 @@ define double @v_minimum_f64__nsz(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz double @llvm.minimum.f64(double %src0, double %src1)
ret double %op
@@ -250,6 +253,7 @@ define double @v_minimum_f64__nnan_nsz(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz double @llvm.minimum.f64(double %src0, double %src1)
ret double %op
@@ -333,6 +337,7 @@ define double @v_minimum_f64__nnan_src0(double %arg0, double %src1) {
; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan double %arg0, 1.0
%op = call double @llvm.minimum.f64(double %src0, double %src1)
@@ -417,6 +422,7 @@ define double @v_minimum_f64__nnan_src1(double %src0, double %arg1) {
; GFX12-NEXT: v_add_f64_e32 v[2:3], 1.0, v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan double %arg1, 1.0
%op = call double @llvm.minimum.f64(double %src0, double %src1)
@@ -520,6 +526,7 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v[0:1]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.minimum.f64(double %src0, double %src1)
call void asm sideeffect "; use $0", "s"(double %op)
@@ -621,6 +628,7 @@ define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -678,6 +686,7 @@ define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -778,6 +787,7 @@ define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -835,6 +845,7 @@ define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[6:7]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -969,6 +980,7 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v[0:3]
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
call void asm sideeffect "; use $0", "s"(<2 x double> %op)
@@ -1095,6 +1107,7 @@ define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1159,6 +1172,7 @@ define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1284,6 +1298,7 @@ define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1348,6 +1363,7 @@ define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[10:11]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1499,6 +1515,7 @@ define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) {
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1570,6 +1587,7 @@ define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1721,6 +1739,7 @@ define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1792,6 +1811,7 @@ define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -2058,6 +2078,7 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) {
; GFX12-NEXT: v_minimum_f64 v[12:13], v[12:13], v[28:29]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[14:15], v[14:15], v[30:31]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x double> @llvm.minimum.v8f64(<8 x double> %src0, <8 x double> %src1)
ret <8 x double> %op
@@ -2934,6 +2955,7 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX12-NEXT: v_minimum_f64 v[28:29], v[28:29], v[84:85]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[30:31], v[30:31], v[86:87]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x double> @llvm.minimum.v16f64(<16 x double> %src0, <16 x double> %src1)
ret <16 x double> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 53ea253035655..28e1808b76e73 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -127,6 +127,7 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
@@ -323,6 +324,7 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
@@ -780,6 +782,7 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
@@ -852,6 +855,7 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 7178eaf2e7384..4c59b77e52205 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -2360,6 +2360,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2
; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2
; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3
; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
@@ -2384,11 +2385,12 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013
; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v30, s6 :: v_dual_and_b32 v13, 1, v13
; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10011
; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10010
; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10017
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v27, s9 :: v_dual_and_b32 v24, 1, v6
; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10016
; GFX12-NEXT: v_and_b32_e32 v9, 1, v9
@@ -2397,6 +2399,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015
; GFX12-NEXT: v_and_b32_e32 v22, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11
; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7
; GFX12-NEXT: v_and_b32_e32 v4, 1, v5
@@ -2794,6 +2797,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000
; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013
; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3
; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3
; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3
@@ -2807,7 +2811,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016
; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014
; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2
; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9
@@ -3454,6 +3458,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2
; GFX12-NEXT: v_lshrrev_b16 v9, 13, s3
; GFX12-NEXT: v_and_b32_e32 v44, 1, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v1, 1, s4
; GFX12-NEXT: s_lshr_b32 s5, s2, 24
; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_and_b32 v41, 1, v2
@@ -3467,14 +3472,16 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v13, 7, s3
; GFX12-NEXT: v_lshrrev_b16 v14, 1, s3
; GFX12-NEXT: v_lshrrev_b16 v17, 5, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v2, 5, s5
; GFX12-NEXT: s_and_b32 s7, s2, 1
; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v52, s18 :: v_dual_and_b32 v35, 1, v9
; GFX12-NEXT: v_and_b32_e32 v9, 1, v1
; GFX12-NEXT: v_lshrrev_b16 v1, 3, s4
; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10017
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v51, s19 :: v_dual_and_b32 v42, 1, v3
; GFX12-NEXT: v_lshrrev_b16 v3, 3, s5
; GFX12-NEXT: v_lshrrev_b16 v15, 3, s3
@@ -3489,30 +3496,34 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012
; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011
; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v59, s12 :: v_dual_and_b32 v22, 1, v13
; GFX12-NEXT: v_dual_mov_b32 v62, s9 :: v_dual_and_b32 v13, 1, v17
; GFX12-NEXT: v_lshrrev_b16 v17, 6, s5
; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10016
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v58, s13 :: v_dual_and_b32 v23, 1, v14
; GFX12-NEXT: s_bfe_u32 s14, s2, 0x10015
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v57, s14 :: v_dual_and_b32 v26, 1, v11
; GFX12-NEXT: v_and_b32_e32 v11, 1, v1
; GFX12-NEXT: v_lshrrev_b16 v1, 1, s5
; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10013
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v55, s15 :: v_dual_and_b32 v34, 1, v7
; GFX12-NEXT: v_lshrrev_b16 v7, 7, s5
; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10012
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v54, s16 :: v_dual_and_b32 v31, 1, v10
; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10011
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v53, s17 :: v_dual_and_b32 v38, 1, v5
; GFX12-NEXT: s_bfe_u32 s20, s3, 0x10016
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v50, s20 :: v_dual_and_b32 v39, 1, v6
; GFX12-NEXT: v_lshrrev_b16 v6, 2, s5
; GFX12-NEXT: s_bfe_u32 s21, s3, 0x10014
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v48, s21 :: v_dual_and_b32 v43, 1, v4
; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
@@ -3522,7 +3533,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v60, s11 :: v_dual_and_b32 v19, 1, v15
; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
; GFX12-NEXT: v_lshrrev_b16 v8, 14, s2
@@ -3541,6 +3552,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015
; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v63, s8 :: v_dual_and_b32 v2, 1, v6
; GFX12-NEXT: v_and_b32_e32 v6, 1, v17
; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v23
@@ -4266,6 +4278,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2
; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4
; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4
; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4
@@ -4311,7 +4324,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s20, s3, 0x10016
; GFX12-NEXT: s_bfe_i32 s21, s3, 0x10014
; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v64, 0 :: v_dual_mov_b32 v49, s3
; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1
; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1
@@ -6791,6 +6804,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2
; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3
@@ -6808,6 +6822,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v28, 1, v21
; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
@@ -6817,6 +6832,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
; GFX12-NEXT: v_and_b32_e32 v13, 1, v13
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
@@ -6827,6 +6843,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v9, 1, v17
; GFX12-NEXT: v_and_b32_e32 v29, 1, v23
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
@@ -6842,6 +6859,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13
; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1
; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26
@@ -7554,6 +7572,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v62, v[30:33], s[0:1] offset:64
; GFX12-NEXT: global_store_b128 v62, v[26:29], s[0:1] offset:48
; GFX12-NEXT: global_store_b128 v62, v[8:11], s[0:1] offset:32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s3
; GFX12-NEXT: s_clause 0x5
; GFX12-NEXT: global_store_b128 v62, v[4:7], s[0:1] offset:16
@@ -8449,6 +8468,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v43, 1, v10
; GFX12-NEXT: v_dual_mov_b32 v68, v1 :: v_dual_and_b32 v69, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v62, v1 :: v_dual_and_b32 v71, 0xffff, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_and_b32 v67, 0xffff, v3
; GFX12-NEXT: v_mov_b32_e32 v66, v1
; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v3, v1
@@ -8457,6 +8477,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3
; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3
; GFX12-NEXT: v_dual_mov_b32 v47, v1 :: v_dual_and_b32 v38, 1, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4
; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4
; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10016
@@ -8465,6 +8486,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v45, 1, v12
; GFX12-NEXT: v_and_b32_e32 v41, 1, v16
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s9
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: s_lshr_b32 s5, s2, 24
@@ -8473,6 +8495,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v49, v1 :: v_dual_and_b32 v40, 1, v8
; GFX12-NEXT: v_and_b32_e32 v44, 1, v14
; GFX12-NEXT: v_and_b32_e32 v14, 1, v6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5
; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5
; GFX12-NEXT: v_lshrrev_b16 v10, 3, s5
@@ -8483,6 +8506,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013
; GFX12-NEXT: v_and_b32_e32 v33, 1, v20
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s9
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: v_lshrrev_b16 v9, 15, s3
@@ -8509,6 +8533,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
@@ -8518,6 +8543,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v82, 0xffff, v35
; GFX12-NEXT: v_and_b32_e32 v35, 1, v27
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v27, v1
; GFX12-NEXT: v_and_b32_e32 v81, 0xffff, v4
@@ -8529,6 +8555,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v31
; GFX12-NEXT: v_and_b32_e32 v31, 1, v29
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
@@ -8538,6 +8565,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v21, 2, s2
; GFX12-NEXT: v_and_b32_e32 v33, 0xffff, v33
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: v_lshrrev_b16 v15, 8, s2
@@ -8561,6 +8589,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v39
; GFX12-NEXT: v_and_b32_e32 v39, 1, v25
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_and_b32 v77, 1, v7
; GFX12-NEXT: v_and_b32_e32 v79, 0xffff, v5
@@ -9818,6 +9847,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v75, s42 :: v_dual_mov_b32 v76, s43
; GFX12-NEXT: v_bfe_i32 v79, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v85, v65, 0, 1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v65, s40
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v12, v[69:72], s[0:1] offset:144
@@ -9903,6 +9933,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v50, 31, v49
; GFX12-NEXT: v_ashrrev_i32_e32 v88, 31, v87
; GFX12-NEXT: v_ashrrev_i32_e32 v86, 31, v85
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v34, s19 :: v_dual_mov_b32 v17, s4
; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 355c296d122ff..d27f9806f2e04 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -2954,6 +2954,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s31
; GFX12-NEXT: s_lshr_b32 s29, s10, 16
; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s30
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s29
; GFX12-NEXT: s_lshr_b32 s28, s11, 16
; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
@@ -2966,6 +2967,7 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
; GFX12-NEXT: s_lshr_b32 s25, s6, 16
; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28
; GFX12-NEXT: v_mov_b32_e32 v10, s11
; GFX12-NEXT: s_lshr_b32 s22, s5, 16
@@ -3444,6 +3446,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s31
; GFX12-NEXT: s_ashr_i32 s29, s10, 16
; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s30
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s29
; GFX12-NEXT: s_ashr_i32 s28, s11, 16
; GFX12-NEXT: s_sext_i32_i16 s11, s11
@@ -3456,6 +3459,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GFX12-NEXT: s_ashr_i32 s25, s6, 16
; GFX12-NEXT: s_sext_i32_i16 s7, s7
; GFX12-NEXT: s_sext_i32_i16 s6, s6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s28
; GFX12-NEXT: v_mov_b32_e32 v10, s11
; GFX12-NEXT: s_ashr_i32 s22, s5, 16
@@ -5787,10 +5791,11 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_nop 0
@@ -6014,14 +6019,15 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s4, 0xffff, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
@@ -6362,23 +6368,28 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s2, 0xffff, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_and_b32 s3, 0xffff, s6
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s5, 0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s5
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s4, 0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
@@ -6950,44 +6961,51 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s10, s5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10
; GFX12-NEXT: s_lshr_b32 s5, s5, 16
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_lshr_b32 s5, s4, 16
; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s5
; GFX12-NEXT: s_lshr_b32 s4, s7, 16
; GFX12-NEXT: s_and_b32 s5, s7, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s4, s6, 16
; GFX12-NEXT: s_and_b32 s5, s6, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_lshr_b32 s2, s1, 16
; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s1, s0, 16
; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9]
@@ -8031,84 +8049,99 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s18, s15, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s18
; GFX12-NEXT: s_lshr_b32 s15, s15, 16
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_lshr_b32 s15, s14, 16
; GFX12-NEXT: s_and_b32 s14, s14, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:240
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s14
; GFX12-NEXT: v_mov_b32_e32 v2, s15
; GFX12-NEXT: s_lshr_b32 s14, s13, 16
; GFX12-NEXT: s_and_b32 s13, s13, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:224
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s13
; GFX12-NEXT: v_mov_b32_e32 v2, s14
; GFX12-NEXT: s_lshr_b32 s13, s12, 16
; GFX12-NEXT: s_and_b32 s12, s12, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:208
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s12
; GFX12-NEXT: v_mov_b32_e32 v2, s13
; GFX12-NEXT: s_lshr_b32 s12, s11, 16
; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:192
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s12
; GFX12-NEXT: s_lshr_b32 s11, s10, 16
; GFX12-NEXT: s_and_b32 s10, s10, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:176
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s10
; GFX12-NEXT: v_mov_b32_e32 v2, s11
; GFX12-NEXT: s_lshr_b32 s10, s9, 16
; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:160
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s9
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s9, s8, 16
; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:144
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: v_mov_b32_e32 v2, s9
; GFX12-NEXT: s_lshr_b32 s8, s7, 16
; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:128
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: s_lshr_b32 s7, s6, 16
; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: v_mov_b32_e32 v2, s7
; GFX12-NEXT: s_lshr_b32 s6, s5, 16
; GFX12-NEXT: s_and_b32 s5, s5, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshr_b32 s5, s4, 16
; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s5
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_lshr_b32 s2, s1, 16
; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s1, s0, 16
; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17]
@@ -8911,12 +8944,14 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x100000
; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s15
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s67
; GFX12-NEXT: v_dual_mov_b32 v2, s66 :: v_dual_mov_b32 v5, s65
; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s61
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s13
; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s59
; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s57
@@ -8929,6 +8964,7 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[16:17] offset:208
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[16:17] offset:192
; GFX12-NEXT: v_dual_mov_b32 v1, s53 :: v_dual_mov_b32 v0, s52
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12
; GFX12-NEXT: v_dual_mov_b32 v5, s45 :: v_dual_mov_b32 v4, s44
; GFX12-NEXT: v_dual_mov_b32 v7, s51 :: v_dual_mov_b32 v6, s50
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index f1a6bccc559f0..4ab55164e0999 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -4390,6 +4390,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, s24 :: v_dual_mov_b32 v0, s22
; GFX12-NEXT: v_dual_mov_b32 v3, s57 :: v_dual_mov_b32 v2, s23
; GFX12-NEXT: v_mov_b32_e32 v5, s56
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 67a376b8c0f3c..aa80081f84ffe 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -1119,8 +1119,8 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
; GFX12-NEXT: s_and_b32 s3, s2, 0xff
; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX12-NEXT: s_nop 0
@@ -1223,8 +1223,8 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
; GFX12-NEXT: s_sext_i32_i8 s3, s2
; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
@@ -1332,6 +1332,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
@@ -1439,6 +1440,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: s_ashr_i32 s3, s2, 24
; GFX12-NEXT: s_sext_i32_i8 s4, s2
; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_mov_b32_e32 v2, s2
@@ -1597,6 +1599,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5
; GFX12-NEXT: v_mov_b32_e32 v6, s3
@@ -1761,6 +1764,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: s_sext_i32_i8 s3, s3
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6
; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: v_mov_b32_e32 v4, s3
@@ -2018,11 +2022,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: s_lshr_b32 s3, s5, 24
; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3
; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: s_lshr_b32 s2, s4, 24
; GFX12-NEXT: s_and_b32 s10, s4, 0xff
; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2
; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5
; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9
@@ -2294,6 +2300,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s9, s5, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: s_ashr_i32 s2, s4, 24
@@ -2305,6 +2312,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v8, s5
; GFX12-NEXT: v_mov_b32_e32 v10, s9
; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v12, s4
; GFX12-NEXT: v_mov_b32_e32 v14, s3
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
@@ -2753,7 +2761,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_and_b32 s21, s7, 0xff
; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13
; GFX12-NEXT: v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
@@ -2767,6 +2775,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14
; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20
; GFX12-NEXT: s_lshr_b32 s3, s5, 24
; GFX12-NEXT: s_and_b32 s19, s5, 0xff
@@ -2776,6 +2785,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_and_b32 s18, s4, 0xff
; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
@@ -3263,6 +3273,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: s_sext_i32_i8 s7, s7
; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22
; GFX12-NEXT: v_mov_b32_e32 v2, s11
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: v_bfe_i32 v25, v11, 0, 8
@@ -3276,6 +3287,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18
; GFX12-NEXT: v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2
; GFX12-NEXT: v_mov_b32_e32 v30, s19
; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80010
@@ -3288,6 +3300,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v20, s6
; GFX12-NEXT: v_mov_b32_e32 v22, s15
; GFX12-NEXT: v_bfe_i32 v17, v14, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v16, s5
; GFX12-NEXT: v_mov_b32_e32 v18, s13
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
@@ -4116,11 +4129,13 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
; GFX12-NEXT: v_dual_mov_b32 v60, 0 :: v_dual_and_b32 v5, 0xffff, v5
; GFX12-NEXT: v_dual_mov_b32 v56, s50 :: v_dual_and_b32 v9, 0xffff, v9
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v58, s15
; GFX12-NEXT: s_and_b32 s43, s8, 0xff
; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010
; GFX12-NEXT: s_and_b32 s48, s13, 0xff
; GFX12-NEXT: s_bfe_u32 s13, s13, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v30, s43 :: v_dual_and_b32 v57, 0xffff, v0
; GFX12-NEXT: v_dual_mov_b32 v59, s34 :: v_dual_mov_b32 v32, s8
; GFX12-NEXT: s_lshr_b32 s27, s9, 24
@@ -4132,6 +4147,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010
; GFX12-NEXT: s_and_b32 s47, s12, 0xff
; GFX12-NEXT: s_bfe_u32 s12, s12, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v36, s9 :: v_dual_and_b32 v53, 0xffff, v2
; GFX12-NEXT: v_dual_mov_b32 v55, s33 :: v_dual_mov_b32 v26, s42
; GFX12-NEXT: s_lshr_b32 s25, s7, 24
@@ -4139,6 +4155,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_and_b32 v23, 0xffff, v12
; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v34, s44 :: v_dual_and_b32 v49, 0xffff, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v51, s31 :: v_dual_mov_b32 v28, s7
; GFX12-NEXT: s_lshr_b32 s28, s10, 24
; GFX12-NEXT: s_lshr_b32 s29, s11, 24
@@ -4148,6 +4165,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_and_b32 s45, s10, 0xff
; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010
; GFX12-NEXT: s_and_b32 s46, s11, 0xff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v40, s10 :: v_dual_and_b32 v45, 0xffff, v4
; GFX12-NEXT: v_dual_mov_b32 v47, s30 :: v_dual_mov_b32 v22, s41
; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010
@@ -4162,10 +4180,11 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v60, v[52:55], s[16:17] offset:224
; GFX12-NEXT: global_store_b128 v60, v[48:51], s[16:17] offset:208
; GFX12-NEXT: global_store_b128 v60, v[44:47], s[16:17] offset:192
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v44, s11 :: v_dual_mov_b32 v45, s29
; GFX12-NEXT: v_mov_b32_e32 v24, s6
; GFX12-NEXT: s_and_b32 s40, s5, 0xff
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v41, s28 :: v_dual_mov_b32 v20, s40
; GFX12-NEXT: s_lshr_b32 s23, s5, 24
; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
@@ -4175,6 +4194,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_and_b32 s39, s4, 0xff
; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v33, s26 :: v_dual_mov_b32 v16, s39
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v29, s25 :: v_dual_mov_b32 v18, s4
; GFX12-NEXT: s_lshr_b32 s21, s3, 24
; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
@@ -4187,10 +4207,12 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v60, v[26:29], s[16:17] offset:112
; GFX12-NEXT: global_store_b128 v60, v[22:25], s[16:17] offset:96
; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v23, s23
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v14, s3
; GFX12-NEXT: s_lshr_b32 s20, s2, 24
; GFX12-NEXT: s_and_b32 s37, s2, 0xff
; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v8, s37
; GFX12-NEXT: s_lshr_b32 s19, s1, 24
; GFX12-NEXT: s_and_b32 s36, s1, 0xff
@@ -4199,6 +4221,7 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_lshr_b32 s18, s0, 24
; GFX12-NEXT: s_and_b32 s35, s0, 0xff
; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v4, s36
; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s19
; GFX12-NEXT: v_dual_mov_b32 v0, s35 :: v_dual_mov_b32 v3, s18
@@ -5061,6 +5084,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_ashr_i32 s47, s14, 24
; GFX12-NEXT: s_bfe_i32 s48, s14, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s14, s14
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v59, 0 :: v_dual_mov_b32 v52, s15
; GFX12-NEXT: v_lshrrev_b16 v6, 8, s11
; GFX12-NEXT: s_ashr_i32 s45, s13, 24
@@ -5080,6 +5104,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s42, s11, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s11, s11
; GFX12-NEXT: v_bfe_i32 v45, v3, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s43
; GFX12-NEXT: v_mov_b32_e32 v46, s46
; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
@@ -5104,12 +5129,14 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v59, v[40:43], s[16:17] offset:192
; GFX12-NEXT: v_mov_b32_e32 v41, s39
; GFX12-NEXT: v_dual_mov_b32 v55, s11 :: v_dual_mov_b32 v58, s41
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v37, s37
; GFX12-NEXT: s_ashr_i32 s33, s7, 24
; GFX12-NEXT: s_ashr_i32 s35, s8, 24
; GFX12-NEXT: s_bfe_i32 s36, s8, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s8, s8
; GFX12-NEXT: v_bfe_i32 v39, v7, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v33, s35
; GFX12-NEXT: v_dual_mov_b32 v40, s40 :: v_dual_mov_b32 v29, s33
; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3
@@ -5131,6 +5158,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s29, s5, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s5, s5
; GFX12-NEXT: v_bfe_i32 v31, v10, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v19, s26
; GFX12-NEXT: v_mov_b32_e32 v32, s36
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
@@ -5158,6 +5186,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s28
; GFX12-NEXT: s_bfe_i32 s23, s2, 0x80010
; GFX12-NEXT: s_sext_i32_i8 s2, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v16, s4
; GFX12-NEXT: v_mov_b32_e32 v18, s27
; GFX12-NEXT: s_bfe_i32 s21, s1, 0x80010
@@ -5171,6 +5200,7 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v8, s2
; GFX12-NEXT: v_mov_b32_e32 v10, s23
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v4, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s21
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
@@ -5869,7 +5899,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_lshr_b32 s4, s2, 24
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
@@ -5877,6 +5907,7 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_and_b32 s2, s2, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
@@ -6027,6 +6058,7 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v6, s6
@@ -6225,26 +6257,30 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX12-NEXT: s_lshr_b32 s5, s3, 24
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_lshr_b32 s4, s2, 24
; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s2, s2, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX12-NEXT: s_and_b32 s2, s3, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
@@ -6490,7 +6526,7 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: v_bfe_i32 v14, v7, 0, 8
; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9
@@ -6831,47 +6867,56 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_lshr_b32 s3, s7, 24
; GFX12-NEXT: s_lshr_b32 s2, s5, 24
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s2, s6, 24
; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s2, s4, 24
; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_and_b32 s2, s6, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5
; GFX12-NEXT: s_and_b32 s2, s7, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
; GFX12-NEXT: s_and_b32 s2, s5, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
; GFX12-NEXT: s_and_b32 s2, s4, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
@@ -7303,6 +7348,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5
@@ -7939,48 +7985,57 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s10, s7, 0x80010
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10
; GFX12-NEXT: s_lshr_b32 s11, s7, 24
; GFX12-NEXT: s_lshr_b32 s10, s5, 24
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s11, s5, 0x80010
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s7
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
; GFX12-NEXT: s_and_b32 s7, s7, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:240
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s3, 24
; GFX12-NEXT: s_bfe_u32 s11, s3, 0x80010
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:176
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s1, 24
; GFX12-NEXT: s_bfe_u32 s11, s1, 0x80010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:112
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s6, 24
; GFX12-NEXT: s_bfe_u32 s11, s6, 0x80010
; GFX12-NEXT: s_and_b32 s6, s6, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s4, 24
; GFX12-NEXT: s_bfe_u32 s11, s4, 0x80010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:208
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s2, 24
; GFX12-NEXT: s_bfe_u32 s11, s2, 0x80010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:144
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s10, s0, 24
; GFX12-NEXT: s_bfe_u32 s11, s0, 0x80010
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:80
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16
@@ -7996,6 +8051,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
; GFX12-NEXT: s_and_b32 s4, s4, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:192
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
@@ -8008,6 +8064,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2
; GFX12-NEXT: s_and_b32 s2, s2, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:128
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
@@ -8020,6 +8077,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s0
; GFX12-NEXT: s_and_b32 s0, s0, 0xff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:64
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
@@ -8866,6 +8924,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v60, s42 :: v_dual_mov_b32 v29, s47
; GFX12-NEXT: v_dual_mov_b32 v28, s46 :: v_dual_mov_b32 v63, s45
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v21, s5
; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v17, s15
; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v9, s13
@@ -9605,14 +9664,16 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3
; GFX12-NEXT: v_lshrrev_b16 v2, 8, s2
; GFX12-NEXT: s_lshr_b32 s2, s2, 24
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v1
; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
@@ -9758,11 +9819,13 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2
; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s4
; GFX12-NEXT: s_ashr_i32 s2, s2, 24
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
@@ -9949,6 +10012,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s3
; GFX12-NEXT: v_and_b32_e64 v3, 0xff, s6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s5
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
@@ -9961,6 +10025,7 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX12-NEXT: s_lshr_b32 s2, s3, 24
; GFX12-NEXT: v_lshl_or_b32 v0, v6, 16, v0
; GFX12-NEXT: v_lshl_or_b32 v2, v1, 16, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v3, s2, 16, v3
; GFX12-NEXT: v_lshl_or_b32 v1, s4, 16, v5
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
@@ -10187,8 +10252,10 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX12-NEXT: s_ashr_i32 s2, s2, 24
; GFX12-NEXT: s_bfe_i32 s3, s6, 0x80000
; GFX12-NEXT: s_bfe_i32 s5, s7, 0x80000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v3
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v5
@@ -10513,6 +10580,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s4
; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s7
; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s6
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s9
; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s3
; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s13
@@ -10537,6 +10605,7 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v5
; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v6
; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v7, s8, 16, v11
; GFX12-NEXT: v_lshl_or_b32 v5, s2, 16, v12
; GFX12-NEXT: v_lshl_or_b32 v3, s12, 16, v9
@@ -10926,6 +10995,7 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i16 v0, 8, s5
; GFX12-NEXT: s_bfe_i32 s5, s5, 0x80000
; GFX12-NEXT: s_bfe_i32 s12, s7, 0x80000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56
; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s6
; GFX12-NEXT: s_bfe_i32 s6, s8, 0x80000
@@ -10937,14 +11007,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s5
; GFX12-NEXT: v_and_b32_e64 v11, 0xffff, s12
; GFX12-NEXT: v_ashrrev_i16 v13, 8, s8
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v16, 0xffff, s6
; GFX12-NEXT: v_ashrrev_i16 v9, 8, s11
; GFX12-NEXT: v_ashrrev_i16 v10, 8, s10
; GFX12-NEXT: s_bfe_i32 s5, s9, 0x80000
; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s3
; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_pack_ll_b32_b16 s2, s5, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2
; GFX12-NEXT: v_lshl_or_b32 v6, v0, 16, v4
; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
@@ -11535,6 +11607,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s2
; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s5
; GFX12-NEXT: v_and_b32_e64 v13, 0xff, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v14, 0xff, s25
; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s0
; GFX12-NEXT: v_and_b32_e64 v15, 0xff, s23
@@ -11545,6 +11618,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v3, 8, s2
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s3
; GFX12-NEXT: v_lshrrev_b16 v2, 8, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v19, 0xff, s17
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
@@ -11566,6 +11640,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v6
; GFX12-NEXT: v_lshl_or_b32 v6, v4, 16, v10
; GFX12-NEXT: v_lshl_or_b32 v4, v3, 16, v11
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v3, s24, 16, v14
; GFX12-NEXT: v_lshl_or_b32 v10, v9, 16, v12
; GFX12-NEXT: v_lshl_or_b32 v8, v8, 16, v13
@@ -11580,6 +11655,7 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: s_lshr_b32 s16, s5, 24
; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v11, s16, 16, v17
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
@@ -11593,11 +11669,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: s_lshr_b32 s18, s2, 24
; GFX12-NEXT: v_lshl_or_b32 v14, v5, 16, v9
; GFX12-NEXT: v_lshl_or_b32 v12, v1, 16, v12
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v15, s12, 16, v13
; GFX12-NEXT: v_lshl_or_b32 v13, s10, 16, v17
; GFX12-NEXT: s_lshr_b32 s22, s0, 24
; GFX12-NEXT: v_lshl_or_b32 v9, s14, 16, v19
; GFX12-NEXT: v_lshl_or_b32 v5, s18, 16, v18
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshl_or_b32 v1, s22, 16, v20
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[8:9] offset:48
@@ -12316,6 +12394,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i16 v1, 8, s0
; GFX12-NEXT: s_bfe_i32 s19, s0, 0x80000
; GFX12-NEXT: v_ashrrev_i16 v5, 8, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_ashr_i64 s[0:1], s[4:5], 56
; GFX12-NEXT: v_and_b32_e64 v10, 0xffff, s2
; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s20
@@ -12323,6 +12402,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i16 v3, 8, s3
; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000
; GFX12-NEXT: s_bfe_i32 s2, s15, 0x80000
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s1, s12, 0x80000
; GFX12-NEXT: v_and_b32_e64 v2, 0xffff, s18
@@ -12333,9 +12413,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s2
; GFX12-NEXT: v_lshl_or_b32 v4, v4, 16, v10
; GFX12-NEXT: v_lshl_or_b32 v10, v5, 16, v12
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s1, s7, 0x80000
; GFX12-NEXT: s_lshr_b32 s11, s7, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s1, s6, 0x80000
; GFX12-NEXT: s_lshr_b32 s10, s6, 16
@@ -12344,9 +12426,11 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v8
; GFX12-NEXT: v_lshl_or_b32 v8, v7, 16, v13
; GFX12-NEXT: v_lshl_or_b32 v7, v11, 16, v15
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s1, s11, 0x80000
; GFX12-NEXT: s_lshr_b32 s13, s5, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v22, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s1, s10, 0x80000
; GFX12-NEXT: v_ashrrev_i16 v9, 8, s17
@@ -12355,6 +12439,7 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i16 v13, 8, s6
; GFX12-NEXT: v_ashrrev_i16 v21, 8, s11
; GFX12-NEXT: v_ashrrev_i16 v23, 8, s10
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v24, 0xffff, s1
; GFX12-NEXT: s_bfe_i32 s5, s16, 0x80000
; GFX12-NEXT: v_ashrrev_i16 v1, 8, s12
@@ -12362,11 +12447,13 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: s_bfe_i32 s4, s13, 0x80000
; GFX12-NEXT: v_and_b32_e64 v20, 0xffff, s3
; GFX12-NEXT: v_ashrrev_i16 v17, 8, s16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v19, 0xffff, s5
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s4, s0
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: v_lshl_or_b32 v3, v9, 16, v14
; GFX12-NEXT: v_lshl_or_b32 v14, v11, 16, v12
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v11, s0
; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v15
; GFX12-NEXT: v_lshl_or_b32 v15, v21, 16, v22
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 65614a17fc011..b755c439b5250 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -27,6 +27,7 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f32:
@@ -143,6 +144,7 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f32__offset:
@@ -260,6 +262,7 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_add_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f32:
@@ -374,6 +377,7 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_add_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f32__offset:
@@ -503,12 +507,14 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f64:
@@ -694,12 +700,14 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f64__offset:
@@ -884,12 +892,14 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f64:
@@ -1066,12 +1076,14 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f64__offset:
@@ -1267,13 +1279,15 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f16:
@@ -1573,13 +1587,15 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f16__offset:
@@ -1887,12 +1903,14 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f16:
@@ -2181,12 +2199,14 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f16__offset:
@@ -2474,13 +2494,15 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f16__offset__align4:
@@ -2710,12 +2732,14 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f16__offset__align4:
@@ -2958,13 +2982,15 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_bf16:
@@ -3312,13 +3338,15 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset:
@@ -3674,12 +3702,14 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_bf16:
@@ -4016,12 +4046,14 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset:
@@ -4357,13 +4389,15 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
@@ -4648,12 +4682,14 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_bf16__offset__align4:
@@ -4914,6 +4950,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_v2f16:
@@ -5138,6 +5175,7 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_v2f16__offset:
@@ -5362,6 +5400,7 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: ds_pk_add_f16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_v2f16:
@@ -5577,6 +5616,7 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: ds_pk_add_f16 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_v2f16__offset:
@@ -5798,6 +5838,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_v2bf16:
@@ -6106,6 +6147,7 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_v2bf16__offset:
@@ -6415,6 +6457,7 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_v2bf16:
@@ -6713,6 +6756,7 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: ds_pk_add_bf16 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
@@ -7010,8 +7054,9 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8
; GFX12-NEXT: s_mov_b32 s6, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s1, s5, 4
@@ -7019,36 +7064,41 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_cbranch_execz .LBB28_2
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5
; GFX12-NEXT: s_lshl_b32 s5, s1, 3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB28_2:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_mov_b32 s7, exec_lo
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
; GFX12-NEXT: s_mov_b32 s6, exec_lo
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX12-NEXT: s_cbranch_execz .LBB28_4
; GFX12-NEXT: ; %bb.3:
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GFX12-NEXT: s_lshl_b32 s0, s1, 4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_f32 v2, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB28_4:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX12-NEXT: s_mov_b32 s1, exec_lo
@@ -7061,22 +7111,26 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: ; implicit-def: $vgpr0
; GFX12-NEXT: .LBB28_5: ; %ComputeLoop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_ctz_i32_b32 s5, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s5
; GFX12-NEXT: s_lshl_b32 s7, 1, s5
; GFX12-NEXT: v_writelane_b32 v0, s0, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB28_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12-NEXT: ; implicit-def: $vgpr1
; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12-NEXT: s_cbranch_execz .LBB28_8
; GFX12-NEXT: ; %bb.7:
@@ -7086,6 +7140,7 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB28_8:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-NEXT: v_readfirstlane_b32 s2, v1
@@ -7877,8 +7932,9 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8
; GFX12-NEXT: s_mov_b32 s6, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s1, s5, 4
@@ -7886,31 +7942,37 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_cbranch_execz .LBB29_2
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5
; GFX12-NEXT: s_lshl_b32 s5, s1, 3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX12-NEXT: .LBB29_2:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, exec_lo
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0
; GFX12-NEXT: s_mov_b32 s6, exec_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX12-NEXT: s_cbranch_execz .LBB29_4
; GFX12-NEXT: ; %bb.3:
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GFX12-NEXT: s_lshl_b32 s0, s1, 4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: ds_add_f32 v2, v1
; GFX12-NEXT: .LBB29_4:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; GFX12-NEXT: s_mov_b32 s1, exec_lo
@@ -7923,28 +7985,33 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: ; implicit-def: $vgpr0
; GFX12-NEXT: .LBB29_5: ; %ComputeLoop
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_ctz_i32_b32 s5, s1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s5
; GFX12-NEXT: s_lshl_b32 s7, 1, s5
; GFX12-NEXT: v_writelane_b32 v0, s0, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB29_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX12-NEXT: ; implicit-def: $vgpr1
; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX12-NEXT: s_cbranch_execz .LBB29_8
; GFX12-NEXT: ; %bb.7:
; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2
; GFX12-NEXT: .LBB29_8:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -8725,6 +8792,7 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode:
@@ -8841,6 +8909,7 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: ds_add_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 6dec36c316ee3..86d0eda70ff36 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -27,6 +27,7 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32:
@@ -117,6 +118,7 @@ define float @local_atomic_fmax_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32__offset:
@@ -209,6 +211,7 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32:
@@ -299,6 +302,7 @@ define void @local_atomic_fmax_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_max_num_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32__offset:
@@ -396,6 +400,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f64:
@@ -494,6 +499,7 @@ define double @local_atomic_fmax_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f64__offset:
@@ -594,6 +600,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f64:
@@ -692,6 +699,7 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f64__offset:
@@ -816,13 +824,15 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f16:
@@ -1129,13 +1139,15 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f16__offset:
@@ -1450,12 +1462,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f16:
@@ -1752,12 +1766,14 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f16__offset:
@@ -2053,13 +2069,15 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f16__offset__align4:
@@ -2297,12 +2315,14 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f16__offset__align4:
@@ -2552,13 +2572,15 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_bf16:
@@ -2908,13 +2930,15 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset:
@@ -3272,12 +3296,14 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_bf16:
@@ -3616,12 +3642,14 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset:
@@ -3959,13 +3987,15 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
@@ -4252,12 +4282,14 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_bf16__offset__align4:
@@ -4531,13 +4563,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2f16:
@@ -4802,13 +4836,15 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2f16__offset:
@@ -5073,12 +5109,14 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2f16:
@@ -5334,12 +5372,14 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2f16__offset:
@@ -5618,13 +5658,15 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2bf16:
@@ -5994,13 +6036,15 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_v2bf16__offset:
@@ -6370,12 +6414,14 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2bf16:
@@ -6733,12 +6779,14 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_v2bf16__ofset:
@@ -7076,6 +7124,7 @@ define float @local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode:
@@ -7166,6 +7215,7 @@ define void @local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index b3132a2fa80dd..1b112aff833a0 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -27,6 +27,7 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32:
@@ -117,6 +118,7 @@ define float @local_atomic_fmin_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32__offset:
@@ -209,6 +211,7 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32:
@@ -299,6 +302,7 @@ define void @local_atomic_fmin_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_min_num_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32__offset:
@@ -396,6 +400,7 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f64:
@@ -494,6 +499,7 @@ define double @local_atomic_fmin_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f64__offset:
@@ -594,6 +600,7 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f64:
@@ -692,6 +699,7 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f64__offset:
@@ -816,13 +824,15 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f16:
@@ -1129,13 +1139,15 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f16__offset:
@@ -1450,12 +1462,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f16:
@@ -1752,12 +1766,14 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f16__offset:
@@ -2053,13 +2069,15 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f16__offset__align4:
@@ -2297,12 +2315,14 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f16__offset__align4:
@@ -2552,13 +2572,15 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_bf16:
@@ -2908,13 +2930,15 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset:
@@ -3272,12 +3296,14 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_bf16:
@@ -3616,12 +3642,14 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset:
@@ -3959,13 +3987,15 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
@@ -4252,12 +4282,14 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_bf16__offset__align4:
@@ -4531,13 +4563,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2f16:
@@ -4802,13 +4836,15 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2f16__offset:
@@ -5073,12 +5109,14 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2f16:
@@ -5334,12 +5372,14 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2f16__offset:
@@ -5618,13 +5658,15 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2bf16:
@@ -5994,13 +6036,15 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_v2bf16__offset:
@@ -6370,12 +6414,14 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2bf16:
@@ -6733,12 +6779,14 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_v2bf16__ofset:
@@ -7076,6 +7124,7 @@ define float @local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode:
@@ -7166,6 +7215,7 @@ define void @local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index 5ebeddd04b2ae..9bc8bafc34a68 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -35,13 +35,15 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB0_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f32:
@@ -246,13 +248,15 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB1_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f32__offset:
@@ -457,12 +461,14 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f32:
@@ -657,12 +663,14 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB3_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f32__offset:
@@ -865,12 +873,14 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB4_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f64:
@@ -1081,12 +1091,14 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB5_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f64__offset:
@@ -1296,12 +1308,14 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f64:
@@ -1501,12 +1515,14 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f64__offset:
@@ -1725,13 +1741,15 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB8_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f16:
@@ -2031,13 +2049,15 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f16__offset:
@@ -2345,12 +2365,14 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f16:
@@ -2639,12 +2661,14 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f16__offset:
@@ -2932,13 +2956,15 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f16__offset__align4:
@@ -3168,12 +3194,14 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f16__offset__align4:
@@ -3416,13 +3444,15 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_bf16:
@@ -3770,13 +3800,15 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset:
@@ -4132,12 +4164,14 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB16_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_bf16:
@@ -4474,12 +4508,14 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB17_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset:
@@ -4815,13 +4851,15 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
@@ -5106,12 +5144,14 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB19_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_bf16__offset__align4:
@@ -5381,13 +5421,15 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_v2f16:
@@ -5635,13 +5677,15 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_v2f16__offset:
@@ -5888,12 +5932,14 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB22_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_v2f16:
@@ -6130,12 +6176,14 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB23_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_v2f16__offset:
@@ -6398,13 +6446,15 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB24_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_v2bf16:
@@ -6774,13 +6824,15 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB25_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_v2bf16__offset:
@@ -7150,12 +7202,14 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB26_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_v2bf16:
@@ -7513,12 +7567,14 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3
; GFX12-NEXT: v_mov_b32_e32 v3, v4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_cbranch_execnz .LBB27_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_v2bf16__ofset:
@@ -7864,13 +7920,15 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB28_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode:
@@ -8074,12 +8132,14 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB29_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode:
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index 390d1d70ff2aa..df954f6f940c8 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
; GCN-NEXT: .LBB0_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
@@ -149,6 +149,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
; GCN-NEXT: .LBB3_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_add_co_i32 s2, s2, -1
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index fef1b57db5685..74710fb1aa01b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -200,6 +200,7 @@ define amdgpu_kernel void @caller() {
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -212,6 +213,7 @@ define amdgpu_kernel void @caller() {
; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX12-GISEL-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -276,9 +278,10 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0
; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v8, s1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
@@ -287,6 +290,7 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[4:5], v8, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%id.x = call i32 @llvm.amdgcn.workgroup.id.x()
%id.y = call i32 @llvm.amdgcn.workgroup.id.y()
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index 8009f917aef5a..f253b2a80cf9d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -107,6 +107,7 @@ define amdgpu_cs void @caller() {
; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi
; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -116,6 +117,7 @@ define amdgpu_cs void @caller() {
; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo
; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX12-GISEL-NEXT: s_endpgm
%idx = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -171,9 +173,10 @@ define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0
; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v8, s1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
@@ -182,6 +185,7 @@ define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[4:5], v8, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%id.x = call i32 @llvm.amdgcn.workgroup.id.x()
%id.y = call i32 @llvm.amdgcn.workgroup.id.y()
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 8eb0a46cc8b17..62a6edd9e743d 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -53,6 +53,7 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -105,6 +106,7 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -157,6 +159,7 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = zext i32 %arg0 to i64
%sext1 = zext i32 %arg1 to i64
@@ -209,6 +212,7 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = zext i32 %arg0 to i64
%sext1 = zext i32 %arg1 to i64
@@ -395,6 +399,7 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i128
%sext1 = sext i32 %arg1 to i128
@@ -447,6 +452,7 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i63
%sext1 = sext i32 %arg1 to i63
@@ -514,6 +520,7 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 31
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i31 %arg0 to i63
%sext1 = sext i31 %arg1 to i63
@@ -589,6 +596,7 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v5
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v4, v[1:2]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ext0 = sext i32 %arg0 to i64
%ext1 = zext i32 %arg1 to i64
@@ -641,6 +649,7 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
%trunc.rhs = and i64 %arg1, 4294967295
@@ -715,6 +724,7 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 8589934591
%trunc.rhs = and i64 %arg1, 4294967295
@@ -790,6 +800,7 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v2, v[4:5]
; GFX12-NEXT: v_and_b32_e32 v2, 1, v3
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v2, v[1:2]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
%trunc.rhs = and i64 %arg1, 8589934591
@@ -842,6 +853,7 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v2, v[4:5]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%shl.lhs = shl i64 %arg0, 32
%trunc.lhs = ashr i64 %shl.lhs, 32
@@ -897,6 +909,7 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp4 = lshr i64 %arg0, 32
%tmp5 = and i64 %arg0, 4294967295
@@ -1063,6 +1076,7 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -1178,6 +1192,7 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -1259,6 +1274,7 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -1331,6 +1347,7 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
; GFX12-NEXT: v_mul_lo_u32 v2, v6, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%m = mul i48 %arg0, %arg1
%a = add i48 %m, %arg2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 45e8b3bcff13c..e9b4ec52599a0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX12-WGP-LABEL: flat_agent_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX12-CU-LABEL: flat_agent_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX12-WGP-LABEL: flat_agent_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX12-CU-LABEL: flat_agent_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -547,6 +551,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX12-WGP-LABEL: flat_agent_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -565,6 +570,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX12-CU-LABEL: flat_agent_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -765,6 +771,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-WGP-LABEL: flat_agent_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -787,6 +794,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-CU-LABEL: flat_agent_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1550,6 +1558,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1562,6 +1571,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_agent_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1729,6 +1739,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1743,6 +1754,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_agent_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1901,6 +1913,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1918,6 +1931,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-CU-LABEL: flat_agent_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2106,6 +2120,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2125,6 +2140,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2315,6 +2331,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2334,6 +2351,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -3215,6 +3233,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3231,6 +3250,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3473,6 +3493,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3491,6 +3512,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3724,6 +3746,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3745,6 +3768,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4008,6 +4032,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4031,6 +4056,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4296,6 +4322,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4319,6 +4346,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4568,6 +4596,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4586,6 +4615,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4830,6 +4860,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4848,6 +4879,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5108,6 +5140,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5131,6 +5164,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5396,6 +5430,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5419,6 +5454,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5684,6 +5720,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5707,6 +5744,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5972,6 +6010,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5995,6 +6034,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6260,6 +6300,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6283,6 +6324,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6548,6 +6590,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6571,6 +6614,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6836,6 +6880,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6859,6 +6904,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7124,6 +7170,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7147,6 +7194,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7401,6 +7449,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7421,6 +7470,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7687,6 +7737,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7710,6 +7761,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7982,6 +8034,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8007,6 +8060,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8294,6 +8348,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8322,6 +8377,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8612,6 +8668,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8640,6 +8697,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8914,6 +8972,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8937,6 +8996,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9206,6 +9266,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9229,6 +9290,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9514,6 +9576,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9542,6 +9605,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9832,6 +9896,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9860,6 +9925,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10150,6 +10216,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10178,6 +10245,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10468,6 +10536,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10496,6 +10565,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10786,6 +10856,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10814,6 +10885,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11104,6 +11176,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11132,6 +11205,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11422,6 +11496,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11450,6 +11525,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11740,6 +11816,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11768,6 +11845,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11953,6 +12031,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX12-WGP-LABEL: flat_agent_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11968,6 +12047,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX12-CU-LABEL: flat_agent_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12138,6 +12218,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12153,6 +12234,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12344,6 +12426,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12363,6 +12446,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12572,6 +12656,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12595,6 +12680,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13359,6 +13445,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13371,6 +13458,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13534,6 +13622,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13548,6 +13637,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13706,6 +13796,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13723,6 +13814,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-CU-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13907,6 +13999,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13926,6 +14019,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -14112,6 +14206,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -14131,6 +14226,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -15042,6 +15138,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15058,6 +15155,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15296,6 +15394,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15314,6 +15413,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15547,6 +15647,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15568,6 +15669,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15827,6 +15929,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15850,6 +15953,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16111,6 +16215,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16134,6 +16239,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16379,6 +16485,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16397,6 +16504,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16637,6 +16745,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16655,6 +16764,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16911,6 +17021,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16934,6 +17045,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17195,6 +17307,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17218,6 +17331,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17479,6 +17593,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17502,6 +17617,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17763,6 +17879,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17786,6 +17903,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18047,6 +18165,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18070,6 +18189,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18331,6 +18451,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18354,6 +18475,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18615,6 +18737,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18638,6 +18761,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18899,6 +19023,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18922,6 +19047,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19176,6 +19302,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19196,6 +19323,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19470,6 +19598,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19494,6 +19623,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19767,6 +19897,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19792,6 +19923,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20087,6 +20219,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20116,6 +20249,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20415,6 +20549,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20444,6 +20579,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20727,6 +20863,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20751,6 +20888,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21029,6 +21167,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21053,6 +21192,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21347,6 +21487,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21376,6 +21517,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21675,6 +21817,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21704,6 +21847,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22003,6 +22147,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22032,6 +22177,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22331,6 +22477,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22360,6 +22507,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22659,6 +22807,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22688,6 +22837,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22987,6 +23137,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23016,6 +23167,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23315,6 +23467,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23344,6 +23497,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23643,6 +23797,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23672,6 +23827,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index fb40274cac1ba..5c59481c59853 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -6,6 +6,7 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_load_0:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -29,18 +30,23 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-NEXT: s_mov_b32 s2, 2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-NEXT: s_mov_b32 s2, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: ; implicit-def: $sgpr2
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-NEXT: v_mov_b32_e32 v2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s3, s4
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_mov_b32 s2, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, v2
; GFX12-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
@@ -64,6 +70,7 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
; GFX12-LABEL: flat_last_use_and_volatile_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -88,6 +95,7 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out)
; GFX12-LABEL: flat_last_use_and_nontemporal_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: s_wait_kmcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 5fa8e6891bafb..b2340caa2933f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-WGP-LABEL: flat_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-LABEL: flat_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -475,18 +477,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr2
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_mov_b32 s3, s4
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1
; GFX12-WGP-NEXT: s_mov_b32 s2, s5
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
@@ -504,18 +511,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_mov_b32 s2, 0
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr2
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_mov_b32 s3, s4
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1
; GFX12-CU-NEXT: s_mov_b32 s2, s5
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
@@ -688,6 +700,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-WGP-LABEL: flat_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -703,6 +716,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-LABEL: flat_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1007,17 +1021,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-WGP-NEXT: s_mov_b32 s0, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-WGP-NEXT: s_mov_b32 s0, 0
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0
; GFX12-WGP-NEXT: s_mov_b32 s1, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3
; GFX12-WGP-NEXT: s_mov_b32 s0, s3
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4
; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
@@ -1036,17 +1055,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-CU-NEXT: s_mov_b32 s0, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-CU-NEXT: s_mov_b32 s0, 0
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0
; GFX12-CU-NEXT: s_mov_b32 s1, s2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3
; GFX12-CU-NEXT: s_mov_b32 s0, s3
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4
; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
@@ -1224,6 +1248,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1242,6 +1267,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 4c9ce15211e34..304c80d7bb24d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX12-WGP-LABEL: flat_singlethread_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX12-CU-LABEL: flat_singlethread_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX12-CU-LABEL: flat_singlethread_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -534,6 +538,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX12-WGP-LABEL: flat_singlethread_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -549,6 +554,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX12-CU-LABEL: flat_singlethread_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -719,6 +725,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -734,6 +741,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1438,6 +1446,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1450,6 +1459,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1590,6 +1600,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1602,6 +1613,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1742,6 +1754,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1754,6 +1767,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1894,6 +1908,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1906,6 +1921,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2046,6 +2062,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2058,6 +2075,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2823,6 +2841,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -2839,6 +2858,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3054,6 +3074,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3070,6 +3091,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3285,6 +3307,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3301,6 +3324,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3516,6 +3540,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3532,6 +3557,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3747,6 +3773,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3763,6 +3790,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3978,6 +4006,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3994,6 +4023,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4209,6 +4239,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4225,6 +4256,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4440,6 +4472,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4456,6 +4489,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4671,6 +4705,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4687,6 +4722,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4902,6 +4938,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4918,6 +4955,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5133,6 +5171,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5149,6 +5188,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5364,6 +5404,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5380,6 +5421,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5595,6 +5637,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5611,6 +5654,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5826,6 +5870,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5842,6 +5887,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6057,6 +6103,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6073,6 +6120,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6320,6 +6368,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6340,6 +6389,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6593,6 +6643,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6613,6 +6664,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6866,6 +6918,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6886,6 +6939,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7139,6 +7193,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7159,6 +7214,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7412,6 +7468,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7432,6 +7489,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7685,6 +7743,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7705,6 +7764,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7958,6 +8018,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7978,6 +8039,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8231,6 +8293,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8251,6 +8314,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8504,6 +8568,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8524,6 +8589,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8777,6 +8843,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8797,6 +8864,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9050,6 +9118,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9070,6 +9139,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9323,6 +9393,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9343,6 +9414,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9596,6 +9668,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9616,6 +9689,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9869,6 +9943,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9889,6 +9964,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10142,6 +10218,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10162,6 +10239,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10339,6 +10417,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10354,6 +10433,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10524,6 +10604,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10539,6 +10620,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10709,6 +10791,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10724,6 +10807,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10894,6 +10978,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10909,6 +10994,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11613,6 +11699,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11625,6 +11712,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11765,6 +11853,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11777,6 +11866,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11917,6 +12007,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11929,6 +12020,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12069,6 +12161,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12081,6 +12174,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12221,6 +12315,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12233,6 +12328,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12998,6 +13094,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13014,6 +13111,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13229,6 +13327,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13245,6 +13344,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13460,6 +13560,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13476,6 +13577,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13691,6 +13793,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13707,6 +13810,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13922,6 +14026,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13938,6 +14043,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14153,6 +14259,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14169,6 +14276,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14384,6 +14492,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14400,6 +14509,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14615,6 +14725,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14631,6 +14742,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14846,6 +14958,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14862,6 +14975,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15077,6 +15191,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15093,6 +15208,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15308,6 +15424,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15324,6 +15441,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15539,6 +15657,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15555,6 +15674,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15770,6 +15890,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15786,6 +15907,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16001,6 +16123,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16017,6 +16140,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16232,6 +16356,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16248,6 +16373,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16495,6 +16621,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16515,6 +16642,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16768,6 +16896,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16788,6 +16917,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17041,6 +17171,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17061,6 +17192,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17314,6 +17446,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17334,6 +17467,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17587,6 +17721,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17607,6 +17742,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17860,6 +17996,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17880,6 +18017,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18133,6 +18271,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18153,6 +18292,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18406,6 +18546,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18426,6 +18567,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18679,6 +18821,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18699,6 +18842,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18952,6 +19096,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18972,6 +19117,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19225,6 +19371,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19245,6 +19392,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19498,6 +19646,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19518,6 +19667,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19771,6 +19921,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19791,6 +19942,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20044,6 +20196,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20064,6 +20217,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20317,6 +20471,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20337,6 +20492,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index e77f1432c1c9d..3502a29edeecb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX12-WGP-LABEL: flat_system_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX12-CU-LABEL: flat_system_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX12-WGP-LABEL: flat_system_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX12-CU-LABEL: flat_system_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -549,6 +553,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX12-WGP-LABEL: flat_system_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -567,6 +572,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX12-CU-LABEL: flat_system_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -769,6 +775,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-WGP-LABEL: flat_system_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -791,6 +798,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-CU-LABEL: flat_system_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1558,6 +1566,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_system_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1570,6 +1579,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_system_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1739,6 +1749,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_system_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1753,6 +1764,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_system_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1913,6 +1925,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-WGP-LABEL: flat_system_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1930,6 +1943,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-CU-LABEL: flat_system_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2122,6 +2136,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_system_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2141,6 +2156,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_system_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2335,6 +2351,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_system_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2354,6 +2371,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_system_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -3245,6 +3263,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3261,6 +3280,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3505,6 +3525,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3523,6 +3544,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3758,6 +3780,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3779,6 +3802,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4046,6 +4070,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4069,6 +4094,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4338,6 +4364,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4361,6 +4388,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4612,6 +4640,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4630,6 +4659,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4876,6 +4906,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4894,6 +4925,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5158,6 +5190,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5181,6 +5214,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5450,6 +5484,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5473,6 +5508,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5742,6 +5778,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5765,6 +5802,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6034,6 +6072,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6057,6 +6096,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6326,6 +6366,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6349,6 +6390,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6618,6 +6660,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6641,6 +6684,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6910,6 +6954,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6933,6 +6978,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7202,6 +7248,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7225,6 +7272,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7479,6 +7527,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7499,6 +7548,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7767,6 +7817,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7790,6 +7841,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8064,6 +8116,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8089,6 +8142,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8380,6 +8434,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8408,6 +8463,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8702,6 +8758,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8730,6 +8787,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9006,6 +9064,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9029,6 +9088,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9300,6 +9360,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9323,6 +9384,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9612,6 +9674,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9640,6 +9703,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9934,6 +9998,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9962,6 +10027,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10256,6 +10322,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10284,6 +10351,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10578,6 +10646,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10606,6 +10675,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10900,6 +10970,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10928,6 +10999,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11222,6 +11294,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11250,6 +11323,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11544,6 +11618,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11572,6 +11647,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11866,6 +11942,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -11894,6 +11971,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -12079,6 +12157,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX12-WGP-LABEL: flat_system_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12094,6 +12173,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX12-CU-LABEL: flat_system_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12264,6 +12344,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12279,6 +12360,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12472,6 +12554,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12491,6 +12574,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX12-CU-LABEL: flat_system_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12702,6 +12786,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12725,6 +12810,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13493,6 +13579,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13505,6 +13592,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13670,6 +13758,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13684,6 +13773,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13844,6 +13934,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-WGP-LABEL: flat_system_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -13861,6 +13952,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-CU-LABEL: flat_system_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -14049,6 +14141,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -14068,6 +14161,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -14258,6 +14352,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -14277,6 +14372,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -15198,6 +15294,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15214,6 +15311,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15454,6 +15552,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15472,6 +15571,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15707,6 +15807,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15728,6 +15829,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15991,6 +16093,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16014,6 +16117,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16279,6 +16383,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16302,6 +16407,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16549,6 +16655,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16567,6 +16674,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16809,6 +16917,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16827,6 +16936,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17087,6 +17197,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17110,6 +17221,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17375,6 +17487,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17398,6 +17511,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17663,6 +17777,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17686,6 +17801,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17951,6 +18067,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17974,6 +18091,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18239,6 +18357,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18262,6 +18381,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18527,6 +18647,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18550,6 +18671,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18815,6 +18937,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18838,6 +18961,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19103,6 +19227,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19126,6 +19251,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19380,6 +19506,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19400,6 +19527,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19676,6 +19804,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19700,6 +19829,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19975,6 +20105,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20000,6 +20131,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20299,6 +20431,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20328,6 +20461,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20631,6 +20765,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20660,6 +20795,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20945,6 +21081,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20969,6 +21106,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21249,6 +21387,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21273,6 +21412,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21571,6 +21711,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21600,6 +21741,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21903,6 +22045,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21932,6 +22075,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22235,6 +22379,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22264,6 +22409,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22567,6 +22713,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22596,6 +22743,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22899,6 +23047,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -22928,6 +23077,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23231,6 +23381,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23260,6 +23411,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23563,6 +23715,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23592,6 +23745,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23895,6 +24049,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -23924,6 +24079,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 6bf54ccabc9da..c2b7aa4fcfbf1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -110,6 +110,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-WGP-LABEL: flat_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -128,6 +129,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-LABEL: flat_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -329,18 +331,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-WGP-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr2
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_mov_b32 s3, s4
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v1
; GFX12-WGP-NEXT: s_mov_b32 s2, s5
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v2
; GFX12-WGP-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
@@ -361,18 +368,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_mov_b32 s2, 0
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr2
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_mov_b32 s3, s4
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v1
; GFX12-CU-NEXT: s_mov_b32 s2, s5
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v2
; GFX12-CU-NEXT: v_add_co_u32 v0, s3, s3, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v2, s2, s2, v1, s3
@@ -498,6 +510,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-WGP-LABEL: flat_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -518,6 +531,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX12-CU-LABEL: flat_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -727,17 +741,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX12-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-WGP-NEXT: s_mov_b32 s0, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-WGP-NEXT: s_mov_b32 s0, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-WGP-NEXT: s_mov_b32 s0, 0
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: ; implicit-def: $sgpr0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v4, v0
; GFX12-WGP-NEXT: s_mov_b32 s1, s2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, v3
; GFX12-WGP-NEXT: s_mov_b32 s0, s3
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v4
; GFX12-WGP-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-WGP-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
@@ -761,17 +780,22 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX12-CU-NEXT: s_mov_b32 s0, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s0
; GFX12-CU-NEXT: s_mov_b32 s0, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v3, s0, v0
; GFX12-CU-NEXT: s_mov_b32 s0, 0
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: ; implicit-def: $sgpr0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v4, v0
; GFX12-CU-NEXT: s_mov_b32 s1, s2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, v3
; GFX12-CU-NEXT: s_mov_b32 s0, s3
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v4
; GFX12-CU-NEXT: v_add_co_u32 v0, s1, s1, v0
; GFX12-CU-NEXT: v_add_co_ci_u32_e64 v3, s0, s0, v1, s1
@@ -896,6 +920,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX12-WGP-LABEL: flat_volatile_workgroup_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -914,6 +939,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX12-CU-LABEL: flat_volatile_workgroup_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index c7826181cc8dd..23982f8a00cdb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX12-WGP-LABEL: flat_wavefront_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX12-CU-LABEL: flat_wavefront_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX12-CU-LABEL: flat_wavefront_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -534,6 +538,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX12-WGP-LABEL: flat_wavefront_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -549,6 +554,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX12-CU-LABEL: flat_wavefront_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -719,6 +725,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -734,6 +741,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1438,6 +1446,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1450,6 +1459,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1590,6 +1600,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1602,6 +1613,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1742,6 +1754,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1754,6 +1767,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1894,6 +1908,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1906,6 +1921,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2046,6 +2062,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2058,6 +2075,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2823,6 +2841,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -2839,6 +2858,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3054,6 +3074,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3070,6 +3091,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3285,6 +3307,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3301,6 +3324,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3516,6 +3540,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3532,6 +3557,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3747,6 +3773,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3763,6 +3790,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3978,6 +4006,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3994,6 +4023,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4209,6 +4239,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4225,6 +4256,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4440,6 +4472,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4456,6 +4489,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4671,6 +4705,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4687,6 +4722,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4902,6 +4938,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4918,6 +4955,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5133,6 +5171,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5149,6 +5188,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5364,6 +5404,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5380,6 +5421,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5595,6 +5637,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5611,6 +5654,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5826,6 +5870,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5842,6 +5887,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6057,6 +6103,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6073,6 +6120,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6320,6 +6368,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6340,6 +6389,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6593,6 +6643,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6613,6 +6664,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6866,6 +6918,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6886,6 +6939,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7139,6 +7193,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7159,6 +7214,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7412,6 +7468,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7432,6 +7489,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7685,6 +7743,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7705,6 +7764,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7958,6 +8018,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7978,6 +8039,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8231,6 +8293,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8251,6 +8314,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8504,6 +8568,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8524,6 +8589,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8777,6 +8843,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8797,6 +8864,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9050,6 +9118,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9070,6 +9139,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9323,6 +9393,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9343,6 +9414,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9596,6 +9668,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9616,6 +9689,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9869,6 +9943,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9889,6 +9964,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10142,6 +10218,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10162,6 +10239,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10339,6 +10417,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX12-WGP-LABEL: flat_wavefront_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10354,6 +10433,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX12-CU-LABEL: flat_wavefront_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10524,6 +10604,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10539,6 +10620,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10709,6 +10791,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10724,6 +10807,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10894,6 +10978,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10909,6 +10994,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11613,6 +11699,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11625,6 +11712,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11765,6 +11853,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11777,6 +11866,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11917,6 +12007,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11929,6 +12020,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12069,6 +12161,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12081,6 +12174,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12221,6 +12315,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12233,6 +12328,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12998,6 +13094,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13014,6 +13111,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13229,6 +13327,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13245,6 +13344,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13460,6 +13560,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13476,6 +13577,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13691,6 +13793,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13707,6 +13810,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13922,6 +14026,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13938,6 +14043,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14153,6 +14259,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14169,6 +14276,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14384,6 +14492,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14400,6 +14509,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14615,6 +14725,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14631,6 +14742,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14846,6 +14958,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14862,6 +14975,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15077,6 +15191,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15093,6 +15208,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15308,6 +15424,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15324,6 +15441,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15539,6 +15657,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15555,6 +15674,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15770,6 +15890,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15786,6 +15907,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16001,6 +16123,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16017,6 +16140,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16232,6 +16356,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16248,6 +16373,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16495,6 +16621,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16515,6 +16642,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16768,6 +16896,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16788,6 +16917,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17041,6 +17171,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17061,6 +17192,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17314,6 +17446,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17334,6 +17467,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17587,6 +17721,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17607,6 +17742,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17860,6 +17996,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17880,6 +18017,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18133,6 +18271,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18153,6 +18292,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18406,6 +18546,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18426,6 +18567,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18679,6 +18821,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18699,6 +18842,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18952,6 +19096,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18972,6 +19117,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19225,6 +19371,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19245,6 +19392,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19498,6 +19646,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19518,6 +19667,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19771,6 +19921,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19791,6 +19942,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20044,6 +20196,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20064,6 +20217,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 8949e4b782f63..cd2c8176b8d33 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -164,6 +164,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX12-WGP-LABEL: flat_workgroup_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -179,6 +180,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX12-CU-LABEL: flat_workgroup_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -349,6 +351,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -364,6 +367,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX12-CU-LABEL: flat_workgroup_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -544,6 +548,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX12-WGP-LABEL: flat_workgroup_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -562,6 +567,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX12-CU-LABEL: flat_workgroup_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -755,6 +761,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -777,6 +784,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1519,6 +1527,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1531,6 +1540,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1687,6 +1697,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1701,6 +1712,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -1854,6 +1866,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -1871,6 +1884,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2040,6 +2054,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2059,6 +2074,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -2229,6 +2245,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -2248,6 +2265,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -3093,6 +3111,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3109,6 +3128,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3340,6 +3360,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3358,6 +3379,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3586,6 +3608,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3607,6 +3630,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3851,6 +3875,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -3874,6 +3899,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4119,6 +4145,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4142,6 +4169,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4375,6 +4403,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4393,6 +4422,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4625,6 +4655,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4643,6 +4674,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4887,6 +4919,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -4910,6 +4943,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5155,6 +5189,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5178,6 +5213,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5423,6 +5459,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5446,6 +5483,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5691,6 +5729,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5714,6 +5753,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5963,6 +6003,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -5983,6 +6024,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6246,6 +6288,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6269,6 +6312,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6535,6 +6579,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6560,6 +6605,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6836,6 +6882,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -6864,6 +6911,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7141,6 +7189,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7169,6 +7218,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7434,6 +7484,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7457,6 +7508,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7721,6 +7773,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -7744,6 +7797,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8020,6 +8074,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8048,6 +8103,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8325,6 +8381,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8353,6 +8410,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8630,6 +8688,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8658,6 +8717,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8935,6 +8995,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -8963,6 +9024,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9240,6 +9302,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9268,6 +9331,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9545,6 +9609,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9573,6 +9638,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9850,6 +9916,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -9878,6 +9945,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10155,6 +10223,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10183,6 +10252,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -10362,6 +10432,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10377,6 +10448,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10547,6 +10619,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10562,6 +10635,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10738,6 +10812,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10757,6 +10832,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -10939,6 +11015,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -10962,6 +11039,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11688,6 +11766,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11700,6 +11779,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -11848,6 +11928,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -11862,6 +11943,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12008,6 +12090,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12025,6 +12108,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12179,6 +12263,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12198,6 +12283,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -12352,6 +12438,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -12371,6 +12458,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -13188,6 +13276,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13204,6 +13293,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13427,6 +13517,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13445,6 +13536,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13666,6 +13758,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13687,6 +13780,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13916,6 +14010,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -13939,6 +14034,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14168,6 +14264,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14191,6 +14288,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14414,6 +14512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14432,6 +14531,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14655,6 +14755,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14673,6 +14774,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14902,6 +15004,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -14925,6 +15028,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15154,6 +15258,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15177,6 +15282,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15406,6 +15512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15429,6 +15536,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15658,6 +15766,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15681,6 +15790,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15910,6 +16020,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -15933,6 +16044,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16162,6 +16274,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16185,6 +16298,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16414,6 +16528,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16437,6 +16552,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16666,6 +16782,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16689,6 +16806,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16936,6 +17054,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -16956,6 +17075,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17215,6 +17335,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17239,6 +17360,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17498,6 +17620,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17523,6 +17646,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17788,6 +17912,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -17817,6 +17942,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18082,6 +18208,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18111,6 +18238,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18370,6 +18498,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18394,6 +18523,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18653,6 +18783,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18677,6 +18808,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18942,6 +19074,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -18971,6 +19104,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19236,6 +19370,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19265,6 +19400,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19530,6 +19666,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19559,6 +19696,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19824,6 +19962,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -19853,6 +19992,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20118,6 +20258,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20147,6 +20288,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20412,6 +20554,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20441,6 +20584,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20706,6 +20850,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -20735,6 +20880,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21000,6 +21146,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
@@ -21029,6 +21176,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index b56860991b194..4ba64af63e5f5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX12-WGP-LABEL: global_agent_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -194,6 +195,7 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX12-CU-LABEL: global_agent_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -378,6 +380,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX12-WGP-LABEL: global_agent_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -390,6 +393,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX12-CU-LABEL: global_agent_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -588,6 +592,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX12-WGP-LABEL: global_agent_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -603,6 +608,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX12-CU-LABEL: global_agent_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -811,6 +817,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-WGP-LABEL: global_agent_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -830,6 +837,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-CU-LABEL: global_agent_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -995,6 +1003,7 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-WGP-LABEL: global_agent_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1006,6 +1015,7 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-CU-LABEL: global_agent_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1162,6 +1172,7 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-WGP-LABEL: global_agent_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1173,6 +1184,7 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-CU-LABEL: global_agent_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1346,6 +1358,7 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-WGP-LABEL: global_agent_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1362,6 +1375,7 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-CU-LABEL: global_agent_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1540,6 +1554,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-WGP-LABEL: global_agent_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1556,6 +1571,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-CU-LABEL: global_agent_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -3395,6 +3411,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3410,6 +3427,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3644,6 +3662,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3661,6 +3680,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3889,6 +3909,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3909,6 +3930,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4165,6 +4187,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4187,6 +4210,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4445,6 +4469,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4467,6 +4492,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4708,6 +4734,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4725,6 +4752,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4961,6 +4989,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4978,6 +5007,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5231,6 +5261,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5253,6 +5284,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5511,6 +5543,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5533,6 +5566,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5791,6 +5825,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5813,6 +5848,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6071,6 +6107,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6093,6 +6130,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6351,6 +6389,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6373,6 +6412,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6631,6 +6671,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6653,6 +6694,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6911,6 +6953,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6933,6 +6976,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7191,6 +7235,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7213,6 +7258,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7454,6 +7500,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7471,6 +7518,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7724,6 +7772,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7744,6 +7793,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8002,6 +8052,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8024,6 +8075,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8299,6 +8351,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8324,6 +8377,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8602,6 +8656,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8627,6 +8682,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8888,6 +8944,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8908,6 +8965,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9164,6 +9222,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9184,6 +9243,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9457,6 +9517,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9482,6 +9543,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9760,6 +9822,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9785,6 +9848,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10063,6 +10127,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10088,6 +10153,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10366,6 +10432,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10391,6 +10458,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10669,6 +10737,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10694,6 +10763,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10972,6 +11042,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10997,6 +11068,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11275,6 +11347,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11300,6 +11373,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11578,6 +11652,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11603,6 +11678,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11802,6 +11878,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX12-WGP-LABEL: global_agent_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11814,6 +11891,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX12-CU-LABEL: global_agent_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11998,6 +12076,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12010,6 +12089,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12208,6 +12288,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12223,6 +12304,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX12-CU-LABEL: global_agent_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12431,6 +12513,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12450,6 +12533,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -12615,6 +12699,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-WGP-LABEL: global_agent_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -12626,6 +12711,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-CU-LABEL: global_agent_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12782,6 +12868,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -12793,6 +12880,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12966,6 +13054,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-WGP-LABEL: global_agent_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -12982,6 +13071,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-CU-LABEL: global_agent_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -13160,6 +13250,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -13176,6 +13267,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -15015,6 +15107,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15030,6 +15123,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15264,6 +15358,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15281,6 +15376,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15509,6 +15605,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15529,6 +15626,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15785,6 +15883,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15807,6 +15906,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16065,6 +16165,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16087,6 +16188,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16328,6 +16430,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16345,6 +16448,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16581,6 +16685,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16598,6 +16703,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16851,6 +16957,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16873,6 +16980,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17131,6 +17239,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17153,6 +17262,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17411,6 +17521,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17433,6 +17544,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17691,6 +17803,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17713,6 +17826,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17971,6 +18085,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17993,6 +18108,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18251,6 +18367,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18273,6 +18390,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18531,6 +18649,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18553,6 +18672,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18811,6 +18931,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18833,6 +18954,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19074,6 +19196,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19091,6 +19214,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19344,6 +19468,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19364,6 +19489,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19637,6 +19763,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19662,6 +19789,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19940,6 +20068,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19965,6 +20094,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20226,6 +20356,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20246,6 +20377,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20502,6 +20634,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20522,6 +20655,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20795,6 +20929,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20820,6 +20955,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21098,6 +21234,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21123,6 +21260,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21401,6 +21539,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21426,6 +21565,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21704,6 +21844,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21729,6 +21870,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22007,6 +22149,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22032,6 +22175,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22310,6 +22454,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22335,6 +22480,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22613,6 +22759,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22638,6 +22785,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22916,6 +23064,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22941,6 +23090,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index 7a9cb992a0cd1..0fc3212b0f46d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -6,6 +6,7 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr
; GFX12-LABEL: global_last_use_load_0:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -25,13 +26,16 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr
; GFX12-LABEL: global_last_use_load_1:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, v0
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-NEXT: s_mov_b32 s4, 2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU
@@ -50,6 +54,7 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i
; GFX12-LABEL: global_last_use_and_volatile_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -70,13 +75,16 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1)
; GFX12-LABEL: global_last_use_and_nontemporal_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, v0
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-NEXT: s_mov_b32 s4, 2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 9b2b3a4cfa9ba..14f1734235673 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -178,6 +178,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX12-WGP-LABEL: global_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -191,6 +192,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX12-CU-LABEL: global_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -437,13 +439,16 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX12-WGP-LABEL: global_nontemporal_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-WGP-NEXT: s_mov_b32 s4, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT
@@ -454,13 +459,16 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX12-CU-LABEL: global_nontemporal_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-CU-NEXT: s_mov_b32 s4, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_NT
@@ -641,6 +649,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX12-WGP-LABEL: global_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -654,6 +663,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX12-CU-LABEL: global_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -881,13 +891,16 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX12-WGP-LABEL: global_nontemporal_store_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3
; GFX12-WGP-NEXT: s_mov_b32 s3, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
@@ -897,13 +910,16 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX12-CU-LABEL: global_nontemporal_store_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3
; GFX12-CU-NEXT: s_mov_b32 s3, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
@@ -1087,6 +1103,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-WGP-LABEL: global_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -1101,6 +1118,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-CU-LABEL: global_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index afc46fbc23a67..33aaeebf658dd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX12-WGP-LABEL: global_singlethread_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -194,6 +195,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX12-CU-LABEL: global_singlethread_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -378,6 +380,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
; GFX12-WGP-LABEL: global_singlethread_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -390,6 +393,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
; GFX12-CU-LABEL: global_singlethread_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -574,6 +578,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
; GFX12-WGP-LABEL: global_singlethread_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -586,6 +591,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
; GFX12-CU-LABEL: global_singlethread_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -770,6 +776,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -782,6 +789,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
; GFX12-CU-LABEL: global_singlethread_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -940,6 +948,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-WGP-LABEL: global_singlethread_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -951,6 +960,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-CU-LABEL: global_singlethread_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1107,6 +1117,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-WGP-LABEL: global_singlethread_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1118,6 +1129,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-CU-LABEL: global_singlethread_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1274,6 +1286,7 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-WGP-LABEL: global_singlethread_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1285,6 +1298,7 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-CU-LABEL: global_singlethread_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1441,6 +1455,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1452,6 +1467,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-CU-LABEL: global_singlethread_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -3004,6 +3020,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3019,6 +3036,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3228,6 +3246,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3243,6 +3262,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3452,6 +3472,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3467,6 +3488,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3676,6 +3698,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3691,6 +3714,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3900,6 +3924,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3915,6 +3940,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4124,6 +4150,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4139,6 +4166,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4348,6 +4376,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4363,6 +4392,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4572,6 +4602,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4587,6 +4618,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4796,6 +4828,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4811,6 +4844,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5020,6 +5054,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5035,6 +5070,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5244,6 +5280,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5259,6 +5296,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5468,6 +5506,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5483,6 +5522,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5692,6 +5732,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5707,6 +5748,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5916,6 +5958,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5931,6 +5974,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6140,6 +6184,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6155,6 +6200,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6389,6 +6435,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6406,6 +6453,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6644,6 +6692,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6661,6 +6710,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6899,6 +6949,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6916,6 +6967,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7154,6 +7206,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7171,6 +7224,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7409,6 +7463,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7426,6 +7481,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7664,6 +7720,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7681,6 +7738,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7919,6 +7977,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7936,6 +7995,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8174,6 +8234,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8191,6 +8252,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8429,6 +8491,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8446,6 +8509,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8684,6 +8748,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8701,6 +8766,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8939,6 +9005,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8956,6 +9023,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9194,6 +9262,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9211,6 +9280,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9449,6 +9519,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9466,6 +9537,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9704,6 +9776,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9721,6 +9794,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9959,6 +10033,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9976,6 +10051,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10167,6 +10243,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10179,6 +10256,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
; GFX12-CU-LABEL: global_singlethread_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10363,6 +10441,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10375,6 +10454,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10559,6 +10639,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10571,6 +10652,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10755,6 +10837,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10767,6 +10850,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10925,6 +11009,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-WGP-LABEL: global_singlethread_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -10936,6 +11021,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-CU-LABEL: global_singlethread_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11092,6 +11178,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11103,6 +11190,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11259,6 +11347,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-WGP-LABEL: global_singlethread_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11270,6 +11359,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-CU-LABEL: global_singlethread_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11426,6 +11516,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11437,6 +11528,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12989,6 +13081,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13004,6 +13097,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13213,6 +13307,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13228,6 +13323,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13437,6 +13533,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13452,6 +13549,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13661,6 +13759,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13676,6 +13775,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13885,6 +13985,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13900,6 +14001,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14109,6 +14211,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14124,6 +14227,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14333,6 +14437,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14348,6 +14453,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14557,6 +14663,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14572,6 +14679,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14781,6 +14889,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14796,6 +14905,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15005,6 +15115,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15020,6 +15131,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15229,6 +15341,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15244,6 +15357,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15453,6 +15567,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15468,6 +15583,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15677,6 +15793,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15692,6 +15809,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15901,6 +16019,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15916,6 +16035,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16125,6 +16245,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16140,6 +16261,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16374,6 +16496,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16391,6 +16514,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16629,6 +16753,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16646,6 +16771,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16884,6 +17010,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16901,6 +17028,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17139,6 +17267,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17156,6 +17285,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17394,6 +17524,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17411,6 +17542,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17649,6 +17781,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17666,6 +17799,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17904,6 +18038,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17921,6 +18056,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18159,6 +18295,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18176,6 +18313,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18414,6 +18552,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18431,6 +18570,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18669,6 +18809,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18686,6 +18827,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18924,6 +19066,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18941,6 +19084,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19179,6 +19323,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19196,6 +19341,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19434,6 +19580,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19451,6 +19598,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19689,6 +19837,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19706,6 +19855,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19944,6 +20094,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19961,6 +20112,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 62a4f3b43b2dc..2c877755019ce 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX12-WGP-LABEL: global_system_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -194,6 +195,7 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX12-CU-LABEL: global_system_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -378,6 +380,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX12-WGP-LABEL: global_system_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -390,6 +393,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX12-CU-LABEL: global_system_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -590,6 +594,7 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX12-WGP-LABEL: global_system_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -605,6 +610,7 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX12-CU-LABEL: global_system_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -815,6 +821,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-WGP-LABEL: global_system_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -834,6 +841,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-CU-LABEL: global_system_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -999,6 +1007,7 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-WGP-LABEL: global_system_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1010,6 +1019,7 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-CU-LABEL: global_system_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1166,6 +1176,7 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-WGP-LABEL: global_system_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1177,6 +1188,7 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-CU-LABEL: global_system_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1352,6 +1364,7 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-WGP-LABEL: global_system_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1368,6 +1381,7 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-CU-LABEL: global_system_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1548,6 +1562,7 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-WGP-LABEL: global_system_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1564,6 +1579,7 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-CU-LABEL: global_system_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -3425,6 +3441,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3440,6 +3457,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3676,6 +3694,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3693,6 +3712,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3923,6 +3943,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3943,6 +3964,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4203,6 +4225,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4225,6 +4248,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4487,6 +4511,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4509,6 +4534,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4752,6 +4778,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4769,6 +4796,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5007,6 +5035,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5024,6 +5053,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5281,6 +5311,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5303,6 +5334,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5565,6 +5597,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5587,6 +5620,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5849,6 +5883,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5871,6 +5906,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6133,6 +6169,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6155,6 +6192,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6396,6 +6434,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6413,6 +6452,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6668,6 +6708,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6688,6 +6729,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6965,6 +7007,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6990,6 +7033,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7272,6 +7316,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7297,6 +7342,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7560,6 +7606,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7580,6 +7627,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7838,6 +7886,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7858,6 +7907,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8135,6 +8185,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8160,6 +8211,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8442,6 +8494,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8467,6 +8520,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8749,6 +8803,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8774,6 +8829,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9056,6 +9112,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9081,6 +9138,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9363,6 +9421,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9388,6 +9447,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9670,6 +9730,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9695,6 +9756,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9977,6 +10039,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10002,6 +10065,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10284,6 +10348,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10309,6 +10374,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10508,6 +10574,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX12-WGP-LABEL: global_system_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10520,6 +10587,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX12-CU-LABEL: global_system_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10704,6 +10772,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10716,6 +10785,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX12-CU-LABEL: global_system_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10916,6 +10986,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX12-WGP-LABEL: global_system_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10931,6 +11002,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX12-CU-LABEL: global_system_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11141,6 +11213,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11160,6 +11233,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11325,6 +11399,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-WGP-LABEL: global_system_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11336,6 +11411,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-CU-LABEL: global_system_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11492,6 +11568,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11503,6 +11580,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-CU-LABEL: global_system_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11678,6 +11756,7 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-WGP-LABEL: global_system_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11694,6 +11773,7 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-CU-LABEL: global_system_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11874,6 +11954,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11890,6 +11971,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -13751,6 +13833,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13766,6 +13849,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14002,6 +14086,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14019,6 +14104,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14249,6 +14335,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14269,6 +14356,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14529,6 +14617,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14551,6 +14640,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14813,6 +14903,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14835,6 +14926,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15078,6 +15170,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15095,6 +15188,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15333,6 +15427,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15350,6 +15445,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15607,6 +15703,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15629,6 +15726,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15891,6 +15989,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15913,6 +16012,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16175,6 +16275,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16197,6 +16298,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16459,6 +16561,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16481,6 +16584,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16743,6 +16847,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16765,6 +16870,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17027,6 +17133,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17049,6 +17156,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17311,6 +17419,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17333,6 +17442,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17595,6 +17705,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17617,6 +17728,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17858,6 +17970,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17875,6 +17988,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18130,6 +18244,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18150,6 +18265,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18410,6 +18526,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18432,6 +18549,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18711,6 +18829,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18736,6 +18855,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19018,6 +19138,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19043,6 +19164,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19306,6 +19428,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19326,6 +19449,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19584,6 +19708,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19604,6 +19729,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19881,6 +20007,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19906,6 +20033,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20188,6 +20316,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20213,6 +20342,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20495,6 +20625,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20520,6 +20651,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20802,6 +20934,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20827,6 +20960,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21109,6 +21243,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21134,6 +21269,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21416,6 +21552,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21441,6 +21578,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21723,6 +21861,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21748,6 +21887,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22030,6 +22170,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -22055,6 +22196,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index a98efb49b4b72..692aee5f4b9ea 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -126,6 +126,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-WGP-LABEL: global_volatile_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -140,6 +141,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-CU-LABEL: global_volatile_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -315,13 +317,16 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-WGP-LABEL: global_volatile_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-WGP-NEXT: s_mov_b32 s4, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS
@@ -334,13 +339,16 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-CU-LABEL: global_volatile_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s4, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s4
; GFX12-CU-NEXT: s_mov_b32 s4, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s4, v1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS
@@ -474,6 +482,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX12-WGP-LABEL: global_volatile_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -493,6 +502,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX12-CU-LABEL: global_volatile_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -665,13 +675,16 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX12-WGP-LABEL: global_volatile_store_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s3
; GFX12-WGP-NEXT: s_mov_b32 s3, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v0, s3, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
@@ -687,13 +700,16 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX12-CU-LABEL: global_volatile_store_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s3
; GFX12-CU-NEXT: s_mov_b32 s3, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v0, s3, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
@@ -833,6 +849,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX12-WGP-LABEL: global_volatile_workgroup_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -848,6 +865,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX12-CU-LABEL: global_volatile_workgroup_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -967,6 +985,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-WGP-LABEL: global_volatile_workgroup_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -983,6 +1002,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-LABEL: global_volatile_workgroup_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index f805e2cf37006..aaa11c0455606 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX12-WGP-LABEL: global_wavefront_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -194,6 +195,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX12-CU-LABEL: global_wavefront_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -378,6 +380,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
; GFX12-WGP-LABEL: global_wavefront_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -390,6 +393,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
; GFX12-CU-LABEL: global_wavefront_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -574,6 +578,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
; GFX12-WGP-LABEL: global_wavefront_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -586,6 +591,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
; GFX12-CU-LABEL: global_wavefront_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -770,6 +776,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -782,6 +789,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
; GFX12-CU-LABEL: global_wavefront_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -940,6 +948,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-WGP-LABEL: global_wavefront_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -951,6 +960,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-CU-LABEL: global_wavefront_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1107,6 +1117,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-WGP-LABEL: global_wavefront_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1118,6 +1129,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-CU-LABEL: global_wavefront_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1274,6 +1286,7 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-WGP-LABEL: global_wavefront_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1285,6 +1298,7 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-CU-LABEL: global_wavefront_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1441,6 +1455,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1452,6 +1467,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-CU-LABEL: global_wavefront_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -3004,6 +3020,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3019,6 +3036,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3228,6 +3246,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3243,6 +3262,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3452,6 +3472,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3467,6 +3488,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3676,6 +3698,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3691,6 +3714,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3900,6 +3924,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3915,6 +3940,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4124,6 +4150,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4139,6 +4166,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4348,6 +4376,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4363,6 +4392,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4572,6 +4602,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4587,6 +4618,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4796,6 +4828,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4811,6 +4844,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5020,6 +5054,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5035,6 +5070,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5244,6 +5280,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5259,6 +5296,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5468,6 +5506,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5483,6 +5522,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5692,6 +5732,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5707,6 +5748,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5916,6 +5958,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5931,6 +5974,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6140,6 +6184,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6155,6 +6200,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6389,6 +6435,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6406,6 +6453,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6644,6 +6692,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6661,6 +6710,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6899,6 +6949,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6916,6 +6967,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7154,6 +7206,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7171,6 +7224,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7409,6 +7463,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7426,6 +7481,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7664,6 +7720,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7681,6 +7738,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7919,6 +7977,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7936,6 +7995,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8174,6 +8234,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8191,6 +8252,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8429,6 +8491,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8446,6 +8509,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8684,6 +8748,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8701,6 +8766,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8939,6 +9005,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8956,6 +9023,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9194,6 +9262,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9211,6 +9280,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9449,6 +9519,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9466,6 +9537,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9704,6 +9776,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9721,6 +9794,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9959,6 +10033,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9976,6 +10051,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10167,6 +10243,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10179,6 +10256,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
; GFX12-CU-LABEL: global_wavefront_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10363,6 +10441,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10375,6 +10454,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10559,6 +10639,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10571,6 +10652,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10755,6 +10837,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10767,6 +10850,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -10925,6 +11009,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-WGP-LABEL: global_wavefront_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -10936,6 +11021,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-CU-LABEL: global_wavefront_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11092,6 +11178,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11103,6 +11190,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11259,6 +11347,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11270,6 +11359,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-CU-LABEL: global_wavefront_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11426,6 +11516,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11437,6 +11528,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12989,6 +13081,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13004,6 +13097,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13213,6 +13307,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13228,6 +13323,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13437,6 +13533,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13452,6 +13549,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13661,6 +13759,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13676,6 +13775,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13885,6 +13985,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13900,6 +14001,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14109,6 +14211,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14124,6 +14227,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14333,6 +14437,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14348,6 +14453,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14557,6 +14663,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14572,6 +14679,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14781,6 +14889,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14796,6 +14905,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15005,6 +15115,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15020,6 +15131,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15229,6 +15341,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15244,6 +15357,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15453,6 +15567,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15468,6 +15583,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15677,6 +15793,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15692,6 +15809,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15901,6 +16019,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15916,6 +16035,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16125,6 +16245,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16140,6 +16261,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16374,6 +16496,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16391,6 +16514,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16629,6 +16753,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16646,6 +16771,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16884,6 +17010,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16901,6 +17028,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17139,6 +17267,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17156,6 +17285,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17394,6 +17524,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17411,6 +17542,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17649,6 +17781,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17666,6 +17799,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17904,6 +18038,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17921,6 +18056,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18159,6 +18295,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18176,6 +18313,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18414,6 +18552,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18431,6 +18570,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18669,6 +18809,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18686,6 +18827,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18924,6 +19066,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18941,6 +19084,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19179,6 +19323,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19196,6 +19341,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19434,6 +19580,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19451,6 +19598,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19689,6 +19837,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19706,6 +19855,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19944,6 +20094,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19961,6 +20112,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 30bf492071535..25c75aa50df09 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -182,6 +182,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX12-WGP-LABEL: global_workgroup_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -194,6 +195,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX12-CU-LABEL: global_workgroup_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -378,6 +380,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
; GFX12-WGP-LABEL: global_workgroup_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -390,6 +393,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
; GFX12-CU-LABEL: global_workgroup_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -578,6 +582,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
; GFX12-WGP-LABEL: global_workgroup_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -593,6 +598,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
; GFX12-CU-LABEL: global_workgroup_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -786,6 +792,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -805,6 +812,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-LABEL: global_workgroup_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -964,6 +972,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-WGP-LABEL: global_workgroup_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -975,6 +984,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-CU-LABEL: global_workgroup_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1131,6 +1141,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-WGP-LABEL: global_workgroup_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1142,6 +1153,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-CU-LABEL: global_workgroup_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1311,6 +1323,7 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-WGP-LABEL: global_workgroup_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1327,6 +1340,7 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-LABEL: global_workgroup_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -1497,6 +1511,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1513,6 +1528,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-LABEL: global_workgroup_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -3212,6 +3228,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3227,6 +3244,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3444,6 +3462,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3461,6 +3480,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3683,6 +3703,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3703,6 +3724,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3934,6 +3956,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -3956,6 +3979,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4187,6 +4211,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4209,6 +4234,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4427,6 +4453,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4444,6 +4471,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4661,6 +4689,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4678,6 +4707,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4908,6 +4938,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -4930,6 +4961,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5161,6 +5193,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5183,6 +5216,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5414,6 +5448,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5436,6 +5471,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5667,6 +5703,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5689,6 +5726,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5920,6 +5958,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -5942,6 +5981,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6173,6 +6213,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6195,6 +6236,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6426,6 +6468,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6448,6 +6491,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6679,6 +6723,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6701,6 +6746,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6936,6 +6982,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -6953,6 +7000,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7195,6 +7243,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7215,6 +7264,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7466,6 +7516,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7488,6 +7539,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7744,6 +7796,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -7769,6 +7822,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8025,6 +8079,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8050,6 +8105,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8293,6 +8349,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8313,6 +8370,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8555,6 +8613,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8575,6 +8634,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8830,6 +8890,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -8855,6 +8916,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9111,6 +9173,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9136,6 +9199,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9392,6 +9456,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9417,6 +9482,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9673,6 +9739,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9698,6 +9765,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9954,6 +10022,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -9979,6 +10048,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10235,6 +10305,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10260,6 +10331,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10516,6 +10588,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10541,6 +10614,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10797,6 +10871,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -10822,6 +10897,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -11014,6 +11090,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11026,6 +11103,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
; GFX12-CU-LABEL: global_workgroup_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11210,6 +11288,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11222,6 +11301,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11410,6 +11490,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11425,6 +11506,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11615,6 +11697,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11634,6 +11717,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
@@ -11792,6 +11876,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-WGP-LABEL: global_workgroup_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11803,6 +11888,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-CU-LABEL: global_workgroup_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -11959,6 +12045,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -11970,6 +12057,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12132,6 +12220,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -12148,6 +12237,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-LABEL: global_workgroup_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -12310,6 +12400,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -12326,6 +12417,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -13984,6 +14076,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -13999,6 +14092,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14216,6 +14310,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14233,6 +14328,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14448,6 +14544,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14468,6 +14565,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14691,6 +14789,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14713,6 +14812,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14936,6 +15036,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -14958,6 +15059,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15175,6 +15277,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15192,6 +15295,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15409,6 +15513,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15426,6 +15531,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15649,6 +15755,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15671,6 +15778,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15894,6 +16002,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -15916,6 +16025,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16139,6 +16249,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16161,6 +16272,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16384,6 +16496,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16406,6 +16519,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16629,6 +16743,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16651,6 +16766,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16874,6 +16990,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -16896,6 +17013,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17119,6 +17237,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17141,6 +17260,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17364,6 +17484,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17386,6 +17507,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17620,6 +17742,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17637,6 +17760,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17879,6 +18003,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -17899,6 +18024,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18143,6 +18269,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18165,6 +18292,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18413,6 +18541,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18438,6 +18567,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18686,6 +18816,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18711,6 +18842,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18953,6 +19085,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -18973,6 +19106,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19215,6 +19349,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19235,6 +19370,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19483,6 +19619,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19508,6 +19645,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19756,6 +19894,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -19781,6 +19920,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20029,6 +20169,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20054,6 +20195,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20302,6 +20444,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20327,6 +20470,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20575,6 +20719,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20600,6 +20745,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20848,6 +20994,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -20873,6 +21020,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21121,6 +21269,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21146,6 +21295,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21394,6 +21544,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
@@ -21419,6 +21570,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
index 67ca31a2bb84e..1a2058cbe39e4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 < %s 2>&1 | FileCheck %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s 2>&1 | FileCheck %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s 2>&1 | FileCheck %s
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 02cd97c9fe82a..d925ca52f8560 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5164,6 +5186,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6047,6 +6078,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6491,6 +6525,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7212,6 +7252,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8189,6 +8238,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9392,6 +9450,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12813,6 +12882,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15050,6 +15143,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17923,6 +18042,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index ba9711333a194..fce60ff12aed3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -183,6 +183,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX12-WGP-LABEL: local_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -196,6 +197,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX12-CU-LABEL: local_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -420,13 +422,16 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX12-WGP-LABEL: local_nontemporal_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
@@ -437,13 +442,16 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX12-CU-LABEL: local_nontemporal_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
; GFX12-CU-NEXT: ds_load_b32 v1, v1
@@ -615,6 +623,7 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX12-WGP-LABEL: local_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -628,6 +637,7 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX12-CU-LABEL: local_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -826,8 +836,10 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1
; GFX12-WGP-NEXT: s_mov_b32 s1, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
@@ -841,8 +853,10 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1
; GFX12-CU-NEXT: s_mov_b32 s1, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
@@ -1027,6 +1041,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
; GFX12-WGP-LABEL: local_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1040,6 +1055,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
; GFX12-CU-LABEL: local_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
index fe5f2c51734f7..033c71574643c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
@@ -2657,6 +2657,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2670,6 +2671,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2837,6 +2839,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2850,6 +2853,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3017,6 +3021,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3030,6 +3035,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3197,6 +3203,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3210,6 +3217,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3377,6 +3385,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3390,6 +3399,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3557,6 +3567,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3570,6 +3581,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3737,6 +3749,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3750,6 +3763,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3917,6 +3931,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3930,6 +3945,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4097,6 +4113,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4110,6 +4127,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4277,6 +4295,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4290,6 +4309,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4457,6 +4477,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4470,6 +4491,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4637,6 +4659,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4650,6 +4673,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4817,6 +4841,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4830,6 +4855,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4997,6 +5023,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5010,6 +5037,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5177,6 +5205,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5190,6 +5219,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5390,6 +5420,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5406,6 +5437,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5611,6 +5643,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5627,6 +5660,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5832,6 +5866,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5848,6 +5883,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6053,6 +6089,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6069,6 +6106,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6274,6 +6312,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6290,6 +6329,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6495,6 +6535,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6511,6 +6552,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6716,6 +6758,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6732,6 +6775,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6937,6 +6981,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6953,6 +6998,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7158,6 +7204,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7174,6 +7221,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7379,6 +7427,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7395,6 +7444,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7600,6 +7650,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7616,6 +7667,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7821,6 +7873,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7837,6 +7890,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8042,6 +8096,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8058,6 +8113,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8263,6 +8319,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8279,6 +8336,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8484,6 +8542,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8500,6 +8559,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -11165,6 +11225,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11178,6 +11239,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11345,6 +11407,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11358,6 +11421,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11525,6 +11589,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11538,6 +11603,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11705,6 +11771,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11718,6 +11785,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11885,6 +11953,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11898,6 +11967,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12065,6 +12135,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12078,6 +12149,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12245,6 +12317,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12258,6 +12331,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12425,6 +12499,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12438,6 +12513,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12605,6 +12681,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12618,6 +12695,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12785,6 +12863,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12798,6 +12877,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12965,6 +13045,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12978,6 +13059,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13145,6 +13227,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13158,6 +13241,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13325,6 +13409,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13338,6 +13423,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13505,6 +13591,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13518,6 +13605,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13685,6 +13773,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13698,6 +13787,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13898,6 +13988,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -13914,6 +14005,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14119,6 +14211,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14135,6 +14228,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14340,6 +14434,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14356,6 +14451,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14561,6 +14657,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14577,6 +14674,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14782,6 +14880,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14798,6 +14897,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15003,6 +15103,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15019,6 +15120,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15224,6 +15326,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15240,6 +15343,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15445,6 +15549,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15461,6 +15566,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15666,6 +15772,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15682,6 +15789,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15887,6 +15995,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15903,6 +16012,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16108,6 +16218,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc
; GFX12-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16124,6 +16235,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc
; GFX12-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16329,6 +16441,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16345,6 +16458,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16550,6 +16664,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16566,6 +16681,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16771,6 +16887,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16787,6 +16904,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16992,6 +17110,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg
; GFX12-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17008,6 +17127,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg
; GFX12-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 1c4c8d41b18f9..548c5aceb25f7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5164,6 +5186,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6047,6 +6078,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6491,6 +6525,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7212,6 +7252,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8189,6 +8238,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9392,6 +9450,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12813,6 +12882,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15050,6 +15143,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17923,6 +18042,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index a52dd9b340169..a8f7051bd5050 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -123,6 +123,7 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX12-WGP-LABEL: local_volatile_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -136,6 +137,7 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX12-CU-LABEL: local_volatile_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -284,13 +286,16 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX12-WGP-LABEL: local_volatile_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b32 s3, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
@@ -301,13 +306,16 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX12-CU-LABEL: local_volatile_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b32 s3, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3
; GFX12-CU-NEXT: ds_load_b32 v1, v1
@@ -423,6 +431,7 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-WGP-LABEL: local_volatile_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -441,6 +450,7 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-LABEL: local_volatile_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -576,8 +586,10 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1
; GFX12-WGP-NEXT: s_mov_b32 s1, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
@@ -596,8 +608,10 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1
; GFX12-CU-NEXT: s_mov_b32 s1, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
index 02e4e0d69dc20..694ffb2964f56 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
@@ -2657,6 +2657,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2670,6 +2671,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2837,6 +2839,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2850,6 +2853,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3017,6 +3021,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3030,6 +3035,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3197,6 +3203,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3210,6 +3217,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3377,6 +3385,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3390,6 +3399,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3557,6 +3567,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3570,6 +3581,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3737,6 +3749,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3750,6 +3763,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3917,6 +3931,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3930,6 +3945,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4097,6 +4113,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4110,6 +4127,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4277,6 +4295,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4290,6 +4309,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4457,6 +4477,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4470,6 +4491,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4637,6 +4659,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4650,6 +4673,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4817,6 +4841,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4830,6 +4855,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4997,6 +5023,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5010,6 +5037,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5177,6 +5205,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5190,6 +5219,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5390,6 +5420,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5406,6 +5437,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5611,6 +5643,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5627,6 +5660,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5832,6 +5866,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -5848,6 +5883,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6053,6 +6089,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6069,6 +6106,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6274,6 +6312,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6290,6 +6329,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6495,6 +6535,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6511,6 +6552,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6716,6 +6758,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6732,6 +6775,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6937,6 +6981,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6953,6 +6998,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7158,6 +7204,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7174,6 +7221,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7379,6 +7427,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7395,6 +7444,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7600,6 +7650,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7616,6 +7667,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7821,6 +7873,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7837,6 +7890,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8042,6 +8096,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8058,6 +8113,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8263,6 +8319,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8279,6 +8336,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8484,6 +8542,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8500,6 +8559,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -11165,6 +11225,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11178,6 +11239,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11345,6 +11407,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11358,6 +11421,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11525,6 +11589,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11538,6 +11603,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11705,6 +11771,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11718,6 +11785,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11885,6 +11953,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -11898,6 +11967,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12065,6 +12135,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12078,6 +12149,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12245,6 +12317,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12258,6 +12331,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12425,6 +12499,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12438,6 +12513,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12605,6 +12681,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12618,6 +12695,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12785,6 +12863,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12798,6 +12877,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12965,6 +13045,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12978,6 +13059,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13145,6 +13227,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13158,6 +13241,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13325,6 +13409,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13338,6 +13423,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13505,6 +13591,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13518,6 +13605,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13685,6 +13773,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13698,6 +13787,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13898,6 +13988,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -13914,6 +14005,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14119,6 +14211,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14135,6 +14228,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14340,6 +14434,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14356,6 +14451,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14561,6 +14657,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14577,6 +14674,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14782,6 +14880,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14798,6 +14897,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15003,6 +15103,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15019,6 +15120,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15224,6 +15326,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15240,6 +15343,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15445,6 +15549,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15461,6 +15566,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15666,6 +15772,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15682,6 +15789,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15887,6 +15995,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15903,6 +16012,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16108,6 +16218,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16124,6 +16235,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16329,6 +16441,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16345,6 +16458,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16550,6 +16664,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16566,6 +16681,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16771,6 +16887,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16787,6 +16904,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16992,6 +17110,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17008,6 +17127,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index c242963228537..0cf644c006fac 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -2881,6 +2881,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -2894,6 +2895,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3074,6 +3076,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3089,6 +3092,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3270,6 +3274,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3288,6 +3293,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3482,6 +3488,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3502,6 +3509,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3697,6 +3705,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3717,6 +3726,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3899,6 +3909,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -3914,6 +3925,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4095,6 +4107,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4110,6 +4123,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4304,6 +4318,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4324,6 +4339,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4519,6 +4535,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4539,6 +4556,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4734,6 +4752,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4754,6 +4773,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4949,6 +4969,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -4969,6 +4990,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5164,6 +5186,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5184,6 +5207,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5379,6 +5403,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5399,6 +5424,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5594,6 +5620,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5614,6 +5641,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5809,6 +5837,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -5829,6 +5858,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -6031,6 +6061,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6047,6 +6078,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6256,6 +6288,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6273,6 +6306,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6491,6 +6525,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6512,6 +6547,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6735,6 +6771,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6757,6 +6794,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -6980,6 +7018,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7002,6 +7041,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7212,6 +7252,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7229,6 +7270,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7438,6 +7480,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7455,6 +7498,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7677,6 +7721,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7699,6 +7744,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7922,6 +7968,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -7944,6 +7991,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8167,6 +8215,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8189,6 +8238,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8412,6 +8462,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8434,6 +8485,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8657,6 +8709,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8679,6 +8732,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8902,6 +8956,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -8924,6 +8979,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9147,6 +9203,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9169,6 +9226,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9392,6 +9450,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -9414,6 +9473,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -12080,6 +12140,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12093,6 +12154,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12260,6 +12322,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12273,6 +12336,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12440,6 +12504,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12453,6 +12518,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12620,6 +12686,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12633,6 +12700,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12800,6 +12868,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12813,6 +12882,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12980,6 +13050,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -12993,6 +13064,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13160,6 +13232,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13173,6 +13246,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13340,6 +13414,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13353,6 +13428,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13520,6 +13596,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13533,6 +13610,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13700,6 +13778,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13713,6 +13792,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13880,6 +13960,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -13893,6 +13974,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14060,6 +14142,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14073,6 +14156,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14240,6 +14324,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14253,6 +14338,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14420,6 +14506,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14433,6 +14520,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14600,6 +14688,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14613,6 +14702,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
@@ -14813,6 +14903,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -14829,6 +14920,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15034,6 +15126,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15050,6 +15143,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15255,6 +15349,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15271,6 +15366,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15476,6 +15572,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15492,6 +15589,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15697,6 +15795,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15713,6 +15812,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15918,6 +16018,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -15934,6 +16035,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16139,6 +16241,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16155,6 +16258,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16360,6 +16464,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16376,6 +16481,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16581,6 +16687,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16597,6 +16704,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16802,6 +16910,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -16818,6 +16927,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17023,6 +17133,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17039,6 +17150,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17244,6 +17356,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17260,6 +17373,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17465,6 +17579,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17481,6 +17596,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17686,6 +17802,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17702,6 +17819,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17907,6 +18025,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
@@ -17923,6 +18042,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
index 61cec731feb56..8e292fa592975 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
@@ -6,6 +6,7 @@ define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr add
; GFX12-LABEL: private_last_use_load_0:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -24,13 +25,16 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add
; GFX12-LABEL: private_last_use_load_1:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v1, v0
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-NEXT: s_mov_b32 s3, 2
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_LU
@@ -49,6 +53,7 @@ define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) %
; GFX12-LABEL: private_last_use_and_volatile_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -69,6 +74,7 @@ define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5
; GFX12-LABEL: private_last_use_and_nontemporal_load:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 4e08065e879fd..c3599c87985be 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -193,6 +193,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX12-WGP-LABEL: private_nontemporal_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -205,6 +206,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX12-CU-LABEL: private_nontemporal_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -442,13 +444,16 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX12-WGP-LABEL: private_nontemporal_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-WGP-NEXT: s_mov_b32 s3, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
@@ -459,13 +464,16 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX12-CU-LABEL: private_nontemporal_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-CU-NEXT: s_mov_b32 s3, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
@@ -648,6 +656,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX12-WGP-LABEL: private_nontemporal_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -660,6 +669,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX12-CU-LABEL: private_nontemporal_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -868,13 +878,16 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX12-WGP-LABEL: private_nontemporal_store_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
@@ -884,13 +897,16 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX12-CU-LABEL: private_nontemporal_store_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
@@ -1085,6 +1101,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
; GFX12-WGP-LABEL: private_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -1099,6 +1116,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
; GFX12-CU-LABEL: private_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index a68b5f36b806e..9146f175eefcd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -135,6 +135,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX12-WGP-LABEL: private_volatile_load_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
@@ -149,6 +150,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX12-CU-LABEL: private_volatile_load_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
@@ -312,13 +314,16 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX12-WGP-LABEL: private_volatile_load_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-WGP-NEXT: s_mov_b32 s3, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS
@@ -331,13 +336,16 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX12-CU-LABEL: private_volatile_load_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3
; GFX12-CU-NEXT: s_mov_b32 s3, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS
@@ -475,6 +483,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX12-WGP-LABEL: private_volatile_store_0:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
@@ -493,6 +502,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX12-CU-LABEL: private_volatile_store_0:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
@@ -646,13 +656,16 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX12-WGP-LABEL: private_volatile_store_1:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT: s_mov_b32 s2, 2
+; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
@@ -668,13 +681,16 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX12-CU-LABEL: private_volatile_store_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
; GFX12-CU-NEXT: s_mov_b32 s2, 2
+; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 0889f8ef6316e..2e73f37458242 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2595,6 +2595,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX12-NEXT: s_branch .LBB16_2
; GFX12-NEXT: .LBB16_4:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: .LBB16_5: ; %endif
; GFX12-NEXT: s_mov_b32 s3, 0x31016000
@@ -2927,6 +2928,7 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX12-NEXT: s_mov_b32 s14, s8
; GFX12-NEXT: s_mov_b32 s2, s9
; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[12:13]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_u64 s[20:21], s[2:3], s[12:13]
; GFX12-NEXT: s_mov_b32 s12, s23
; GFX12-NEXT: s_mov_b32 s16, s5
@@ -2943,10 +2945,13 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX12-NEXT: s_mov_b32 s25, s6
; GFX12-NEXT: s_add_nc_u64 s[6:7], s[12:13], s[18:19]
; GFX12-NEXT: s_mov_b32 s23, s13
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7]
; GFX12-NEXT: s_or_b64 s[8:9], s[22:23], s[24:25]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_mov_b32 s3, 0x31016000
; GFX12-NEXT: s_mov_b32 s2, -1
@@ -3354,6 +3359,7 @@ define i32 @mul_pow2_plus_1(i32 %val) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: mul_pow2_plus_1:
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index 5c09d2bd61a39..97d4f4696e827 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -44,6 +44,7 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 1
%load = load i8, ptr %gep, align 4
@@ -83,6 +84,7 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 2047
%load = load i8, ptr %gep, align 4
@@ -122,6 +124,7 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 4095
%load = load i8, ptr %gep, align 4
@@ -165,6 +168,7 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
@@ -226,6 +230,7 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
@@ -287,6 +292,7 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -2048
%load = load i8, ptr %gep, align 4
@@ -330,6 +336,7 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -4096
%load = load i8, ptr %gep, align 4
@@ -373,6 +380,7 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -8192
%load = load i8, ptr %gep, align 4
@@ -416,6 +424,7 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8388608
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -8388608
%load = load i8, ptr %gep, align 4
@@ -456,6 +465,7 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 4095
%load = load i8, ptr %gep, align 4
@@ -499,6 +509,7 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
@@ -560,6 +571,7 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
@@ -623,6 +635,7 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388606
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
@@ -654,6 +667,7 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 16777214
%load = load i8, ptr %gep, align 4
@@ -697,6 +711,7 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -4096
%load = load i8, ptr %gep, align 4
@@ -740,6 +755,7 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -8192
%load = load i8, ptr %gep, align 4
@@ -783,6 +799,7 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -16384
%load = load i8, ptr %gep, align 4
@@ -828,6 +845,7 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
@@ -841,6 +859,7 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -16777215
%load = load i8, ptr %gep, align 4
@@ -887,6 +906,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
@@ -938,13 +958,14 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589936639
%load = load i8, ptr %gep, align 4
@@ -991,6 +1012,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
@@ -1042,13 +1064,14 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589936640
%load = load i8, ptr %gep, align 4
@@ -1095,6 +1118,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
@@ -1146,13 +1170,14 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589938687
%load = load i8, ptr %gep, align 4
@@ -1199,6 +1224,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
@@ -1250,13 +1276,14 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589938688
%load = load i8, ptr %gep, align 4
@@ -1303,6 +1330,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
@@ -1354,13 +1382,14 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589942783
%load = load i8, ptr %gep, align 4
@@ -1407,6 +1436,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
@@ -1458,13 +1488,14 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589942784
%load = load i8, ptr %gep, align 4
@@ -1512,6 +1543,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
@@ -1563,13 +1595,14 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854773761
%load = load i8, ptr %gep, align 4
@@ -1617,6 +1650,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
@@ -1668,13 +1702,14 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854773760
%load = load i8, ptr %gep, align 4
@@ -1722,6 +1757,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
@@ -1773,13 +1809,14 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854771713
%load = load i8, ptr %gep, align 4
@@ -1827,6 +1864,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
@@ -1878,13 +1916,14 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854771712
%load = load i8, ptr %gep, align 4
@@ -1932,6 +1971,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
@@ -1983,13 +2023,14 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854767617
%load = load i8, ptr %gep, align 4
@@ -2037,6 +2078,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
@@ -2088,13 +2130,14 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854767616
%load = load i8, ptr %gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index b5b8213bcd57e..b0d10aa24ce69 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -42,6 +42,7 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 1
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -79,6 +80,7 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:2047
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -118,6 +120,7 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: global_inst_valu_offset_12bit_max:
@@ -170,6 +173,7 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8191
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_13bit_max:
@@ -240,6 +244,7 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8388607
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_24bit_max:
@@ -304,6 +309,7 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -343,6 +349,7 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -386,6 +393,7 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8192
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -429,6 +437,7 @@ define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8388608
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8388608
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -468,6 +477,7 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_11bit_max:
@@ -520,6 +530,7 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8191
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
@@ -590,6 +601,7 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:16383
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
@@ -662,6 +674,7 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
@@ -702,6 +715,7 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8388606
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -741,6 +755,7 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -784,6 +799,7 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8192
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -827,6 +843,7 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-16384
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -872,6 +889,7 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
@@ -912,6 +930,7 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -970,13 +989,14 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
@@ -1017,6 +1037,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1074,13 +1095,14 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
@@ -1121,6 +1143,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1178,13 +1201,14 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
@@ -1225,6 +1249,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1282,13 +1307,14 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
@@ -1329,6 +1355,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1386,13 +1413,14 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
@@ -1433,6 +1461,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1490,13 +1519,14 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000
; GFX12-GISEL-NEXT: s_mov_b32 s1, 2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
@@ -1537,6 +1567,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1594,13 +1625,14 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x7ff
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
@@ -1642,6 +1674,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386561
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1699,13 +1732,14 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x800
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
@@ -1747,6 +1781,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386560
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1804,13 +1839,14 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0xfff
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
@@ -1852,6 +1888,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384513
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1909,13 +1946,14 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1000
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
@@ -1957,6 +1995,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384512
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -2014,13 +2053,14 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x1fff
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
@@ -2062,6 +2102,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380417
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -2119,13 +2160,14 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x2000
; GFX12-GISEL-NEXT: s_brev_b32 s1, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
@@ -2167,6 +2209,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380416
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
%load = load i8, ptr addrspace(1) %gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index bbbbc0dc0f28d..fa8be7ee1b33e 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -7,12 +7,15 @@ define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
; GFX12-NEXT: s_cselect_b32 s1, 0x42800000, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_f32 s0, s0, s1
; GFX12-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_s_exp_f32 s0, s0
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
; GFX12-NEXT: s_mul_f32 s0, s0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%result = call float @llvm.exp2.f32(float %src)
@@ -57,12 +60,15 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_cmp_lt_f32 s0, 0x800000
; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_f32 s0, s0, s1
; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_s_log_f32 s0, s0
-; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
; GFX12-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%result = call float @llvm.log2.f32(float %src)
@@ -163,32 +169,36 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mul_f32 s1, s0, 0x4f800000
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xf800000
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(TRANS32_DEP_1)
; GFX12-SDAG-NEXT: s_cselect_b32 s1, s1, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_s_sqrt_f32 s2, s1
; GFX12-SDAG-NEXT: s_mov_b32 s4, s1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_add_co_i32 s3, s2, -1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_xor_b32 s5, s3, 0x80000000
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-SDAG-NEXT: s_fmac_f32 s4, s5, s2
; GFX12-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_cmp_le_f32 s4, 0
; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s2
; GFX12-SDAG-NEXT: s_add_co_i32 s4, s2, 1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_xor_b32 s6, s4, 0x80000000
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
; GFX12-SDAG-NEXT: s_fmac_f32 s5, s6, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_2)
; GFX12-SDAG-NEXT: s_cmp_gt_f32 s5, 0
; GFX12-SDAG-NEXT: s_cselect_b32 s2, s4, s3
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xf800000
; GFX12-SDAG-NEXT: s_mul_f32 s0, s2, 0x37800000
; GFX12-SDAG-NEXT: v_cmp_class_f32_e64 s3, s1, 0x260
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-SDAG-NEXT: s_cselect_b32 s0, s0, s2
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_and_b32 s2, s3, exec_lo
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_cselect_b32 s0, s1, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
@@ -197,28 +207,29 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-GISEL-NEXT: s_cmp_gt_f32 0xf800000, s0
; GFX12-GISEL-NEXT: s_mul_f32 s2, s0, 0x4f800000
; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_1)
; GFX12-GISEL-NEXT: s_cselect_b32 s0, s2, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_s_sqrt_f32 s2, s0
; GFX12-GISEL-NEXT: s_mov_b32 s4, s0
; GFX12-GISEL-NEXT: s_mov_b32 s6, s0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_add_co_i32 s3, s2, -1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_xor_b32 s5, s3, 0x80000000
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_fmac_f32 s4, s5, s2
; GFX12-GISEL-NEXT: s_add_co_i32 s5, s2, 1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_xor_b32 s7, s5, 0x80000000
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
; GFX12-GISEL-NEXT: s_cmp_le_f32 s4, 0
; GFX12-GISEL-NEXT: s_fmac_f32 s6, s7, s2
; GFX12-GISEL-NEXT: s_cselect_b32 s2, s3, s2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_3)
; GFX12-GISEL-NEXT: s_cmp_gt_f32 s6, 0
; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1
; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s1, s0, 0x260
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -265,15 +276,18 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
; GFX12-LABEL: srcmods_abs_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_bitset0_b32 s0, 31
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_cmp_lt_f32 s0, 0x800000
; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_f32 s0, s0, s1
; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_s_log_f32 s0, s0
; GFX12-NEXT: s_sub_f32 s0, s0, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%abs = call float @llvm.fabs.f32(float %src)
@@ -287,27 +301,33 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
; GFX12-SDAG-NEXT: s_xor_b32 s1, s0, 0x80000000
; GFX12-SDAG-NEXT: s_cmp_gt_f32 s0, 0x80800000
; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_mul_f32 s0, s1, s0
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: srcmods_neg_f32:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_xor_b32 s0, s0, 0x80000000
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000
; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1
; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0
; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: ; return to shader part epilog
%neg = fneg float %src
diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
index 4ea77d1d1ac15..b7aecca45def5 100644
--- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll
@@ -63,10 +63,11 @@ define void @test_remat_s_getpc_b64() {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v2, s30, 0
; GFX12-NEXT: s_getpc_b64 s[0:1]
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s1, s1
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ;;#ASMEND
@@ -74,16 +75,19 @@ define void @test_remat_s_getpc_b64() {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_getpc_b64 s[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sext_i32_i16 s1, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_readlane_b32 s31, v2, 1
; GFX12-NEXT: v_readlane_b32 s30, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%0 = tail call i64 @llvm.amdgcn.s.getpc()
diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
index 50a3336a7483c..faafb77782cae 100644
--- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
@@ -31,6 +31,7 @@ define float @v_test_fmin_legacy_ule_f32_safe(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -60,6 +61,7 @@ define float @v_test_fmin_legacy_ule_f32_nnan_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule float %a, %b
%val = select nnan i1 %cmp, float %a, float %b
@@ -89,6 +91,7 @@ define float @v_test_fmin_legacy_ule_f32_nsz_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule float %a, %b
%val = select nsz i1 %cmp, float %a, float %b
@@ -116,6 +119,7 @@ define float @v_test_fmin_legacy_ule_f32_nnan_nsz_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule float %a, %b
%val = select nnan nsz i1 %cmp, float %a, float %b
@@ -145,6 +149,7 @@ define float @v_test_fmax_legacy_uge_f32_safe(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -174,6 +179,7 @@ define float @v_test_fmax_legacy_uge_f32_nnan_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge float %a, %b
%val = select nnan i1 %cmp, float %a, float %b
@@ -203,6 +209,7 @@ define float @v_test_fmax_legacy_uge_f32_nsz_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge float %a, %b
%val = select nsz i1 %cmp, float %a, float %b
@@ -230,6 +237,7 @@ define float @v_test_fmax_legacy_uge_f32_nnan_nsz_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge float %a, %b
%val = select nnan nsz i1 %cmp, float %a, float %b
@@ -264,6 +272,7 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_safe(<2 x float> %a, <2 x float
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x float> %a, %b
%val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -298,6 +307,7 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_flag(<2 x float> %a, <2 x
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x float> %a, %b
%val = select nnan <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -332,6 +342,7 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nsz_flag(<2 x float> %a, <2 x f
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x float> %a, %b
%val = select nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -366,6 +377,7 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag(<2 x float> %a, <
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x float> %a, %b
%val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -400,6 +412,7 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_safe(<2 x float> %a, <2 x float
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x float> %a, %b
%val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -434,6 +447,7 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_flag(<2 x float> %a, <2 x
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x float> %a, %b
%val = select nnan <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -468,6 +482,7 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nsz_flag(<2 x float> %a, <2 x f
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x float> %a, %b
%val = select nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -502,6 +517,7 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag(<2 x float> %a, <
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x float> %a, %b
%val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -535,6 +551,7 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule half %a, %b
%val = select i1 %cmp, half %a, half %b
@@ -568,6 +585,7 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule half %a, %b
%val = select nnan i1 %cmp, half %a, half %b
@@ -601,6 +619,7 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule half %a, %b
%val = select nsz i1 %cmp, half %a, half %b
@@ -632,6 +651,7 @@ define half @v_test_fmin_legacy_ule_f16_nnan_nsz_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule half %a, %b
%val = select nnan nsz i1 %cmp, half %a, half %b
@@ -665,6 +685,7 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge half %a, %b
%val = select i1 %cmp, half %a, half %b
@@ -698,6 +719,7 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge half %a, %b
%val = select nnan i1 %cmp, half %a, half %b
@@ -731,6 +753,7 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge half %a, %b
%val = select nsz i1 %cmp, half %a, half %b
@@ -762,6 +785,7 @@ define half @v_test_fmax_legacy_uge_f16_nnan_nsz_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge half %a, %b
%val = select nnan nsz i1 %cmp, half %a, half %b
@@ -812,6 +836,7 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> %
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x half> %a, %b
%val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -862,6 +887,7 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x half> %a, %b
%val = select nnan <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -912,6 +938,7 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x half> %a, %b
%val = select nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -948,6 +975,7 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag(<2 x half> %a, <2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x half> %a, %b
%val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -998,6 +1026,7 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> %
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x half> %a, %b
%val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -1048,6 +1077,7 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x half> %a, %b
%val = select nnan <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -1098,6 +1128,7 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x half> %a, %b
%val = select nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -1134,6 +1165,7 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag(<2 x half> %a, <2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x half> %a, %b
%val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -1209,6 +1241,7 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <4 x half> %a, %b
%val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1284,6 +1317,7 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <4 x half> %a, %b
%val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1359,6 +1393,7 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <4 x half> %a, %b
%val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1407,6 +1442,7 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag(<4 x half> %a, <4
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2
; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <4 x half> %a, %b
%val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1482,6 +1518,7 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <4 x half> %a, %b
%val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1557,6 +1594,7 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <4 x half> %a, %b
%val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1632,6 +1670,7 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <4 x half> %a, %b
%val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1680,6 +1719,7 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag(<4 x half> %a, <4
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <4 x half> %a, %b
%val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1713,6 +1753,7 @@ define float @v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs(float %arg0, float
; GFX12-NEXT: v_dual_add_f32 v0, v0, v0 :: v_dual_add_f32 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a = fadd nnan float %arg0, %arg0
%b = fadd nnan float %arg1, %arg1
@@ -1748,6 +1789,7 @@ define float @v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs(float %arg0, float
; GFX12-NEXT: v_dual_add_f32 v0, v0, v0 :: v_dual_add_f32 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a = fadd nnan float %arg0, %arg0
%b = fadd nnan float %arg1, %arg1
diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
index 65feaf23ae2cb..ab222f4feeef0 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
@@ -45,18 +45,12 @@
name: mask_hazard_getpc1
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_getpc1
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_getpc1
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_getpc1
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$sgpr0_sgpr1 = S_GETPC_B64
$sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
@@ -67,24 +61,15 @@ body: |
name: mask_hazard_getpc2
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_getpc2
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
- ; GFX11-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc
- ; GFX11-NEXT: }
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_getpc2
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc
- ; GFX12-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc, implicit $scc
- ; GFX12-NEXT: }
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_getpc2
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
+ ; GCN-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc
+ ; GCN-NEXT: }
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
BUNDLE implicit-def $sgpr0_sgpr1 {
$sgpr0_sgpr1 = S_GETPC_B64
@@ -523,18 +508,12 @@ body: |
name: mask_hazard_subreg4
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_subreg4
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
- ; GFX11-NEXT: $vcc_lo = S_MOV_B32 0
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_subreg4
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
- ; GFX12-NEXT: $vcc_lo = S_MOV_B32 0
- ; GFX12-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_subreg4
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GCN-NEXT: $vcc_lo = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
$vcc_lo = S_MOV_B32 0
$sgpr2 = S_MOV_B32 $vcc_lo
@@ -546,18 +525,12 @@ body: |
name: mask_hazard_subreg5
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_subreg5
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
- ; GFX11-NEXT: $vcc_hi = S_MOV_B32 0
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_subreg5
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
- ; GFX12-NEXT: $vcc_hi = S_MOV_B32 0
- ; GFX12-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_subreg5
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GCN-NEXT: $vcc_hi = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
$vcc_hi = S_MOV_B32 0
$sgpr2 = S_MOV_B32 $vcc_hi
@@ -569,20 +542,13 @@ body: |
name: mask_hazard_waitcnt
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_waitcnt
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: S_WAITCNT 0
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_waitcnt
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: S_WAITCNT 0
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_waitcnt
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 0
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
S_WAITCNT 0
$sgpr0_sgpr1 = S_GETPC_B64
@@ -595,22 +561,14 @@ body: |
name: mask_hazard_gap1
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_gap1
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
- ; GFX11-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_gap1
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
- ; GFX12-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_gap1
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
$vgpr3 = V_MOV_B32_e32 0, implicit $exec
@@ -624,20 +582,13 @@ body: |
name: mask_hazard_gap2
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_gap2
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_gap2
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_gap2
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode
$sgpr0_sgpr1 = S_GETPC_B64
@@ -650,20 +601,13 @@ body: |
name: mask_hazard_gap3
body: |
bb.0:
- ; GFX11-LABEL: name: mask_hazard_gap3
- ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX11-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
- ; GFX11-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
- ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX11-NEXT: S_ENDPGM 0
- ;
- ; GFX12-LABEL: name: mask_hazard_gap3
- ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
- ; GFX12-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
- ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
- ; GFX12-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
- ; GFX12-NEXT: S_ENDPGM 0
+ ; GCN-LABEL: name: mask_hazard_gap3
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0
$vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
$vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2
$sgpr0_sgpr1 = S_GETPC_B64
diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
new file mode 100644
index 0000000000000..d84fe9bf21d65
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
@@ -0,0 +1,848 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O0 %s
+# RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O2 %s
+
+--- |
+ @mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>]
+
+ define amdgpu_gs void @hazard_getpc1() { ret void }
+ define amdgpu_gs void @hazard_getpc2() { ret void }
+ define amdgpu_gs void @hazard_getpc3() { ret void }
+ define amdgpu_gs void @hazard_getpc4() { ret void }
+ define amdgpu_gs void @hazard_vcc1() { ret void }
+ define amdgpu_gs void @hazard_vcc2() { ret void }
+ define amdgpu_gs void @hazard_vcc3() { ret void }
+ define amdgpu_gs void @hazard_addc1() { ret void }
+ define amdgpu_gs void @hazard_addc2() { ret void }
+ define amdgpu_gs void @hazard_addc3() { ret void }
+ define amdgpu_gs void @hazard_addc4() { ret void }
+ define amdgpu_gs void @hazard_addc5() { ret void }
+ define amdgpu_gs void @hazard_addc6() { ret void }
+ define amdgpu_gs void @hazard_vaddc1() { ret void }
+ define amdgpu_gs void @hazard_gap1() { ret void }
+ define amdgpu_gs void @hazard_gap2() { ret void }
+ define amdgpu_gs void @hazard_gap3() { ret void }
+ define amdgpu_gs void @hazard_gap4_no_hazard() { ret void }
+ define amdgpu_gs void @hazard_valu_write1_no_hazard() { ret void }
+ define amdgpu_gs void @hazard_post_order1() { ret void }
+ define amdgpu_gs void @hazard_post_order2() { ret void }
+ define amdgpu_gs void @hazard_post_order_cycle() { ret void }
+ define amdgpu_cs void @hazard_calls() { ret void }
+...
+
+---
+name: hazard_getpc1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_getpc1
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_getpc1
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_getpc2
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_getpc2
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_getpc2
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr1, implicit $exec
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_getpc3
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_getpc3
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 20, implicit-def $scc, implicit $scc
+ ; GCN-O0-NEXT: }
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_getpc3
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc
+ ; GCN-O2-NEXT: }
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ BUNDLE implicit-def $sgpr0_sgpr1 {
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 4, implicit-def $scc
+ $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 12, implicit-def $scc, implicit $scc
+ }
+ S_ENDPGM 0
+...
+
+---
+name: hazard_getpc4
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_getpc4
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 28, implicit-def $scc, implicit $scc
+ ; GCN-O0-NEXT: }
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_getpc4
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 {
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr1 = S_SEXT_I32_I16 $sgpr1
+ ; GCN-O2-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 12, implicit-def $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 24, implicit-def $scc, implicit $scc
+ ; GCN-O2-NEXT: }
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ BUNDLE implicit-def $sgpr0_sgpr1 {
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr1 = S_SEXT_I32_I16 $sgpr1
+ $sgpr0 = S_ADD_U32 $sgpr0, target-flags(amdgpu-rel32-lo) @mem + 8, implicit-def $scc
+ $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-hi) @mem + 16, implicit-def $scc, implicit $scc
+ }
+ S_ENDPGM 0
+...
+
+---
+name: hazard_vcc1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_vcc1
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec
+ ; GCN-O0-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_vcc1
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec
+ ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2, implicit $exec
+ $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_vcc2
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_vcc2
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec
+ ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_vcc2
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec
+ ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr4 = S_ADD_U32 $vcc_lo, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_vcc3
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_vcc3
+ ; GCN-O0: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec
+ ; GCN-O0-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_vcc3
+ ; GCN-O2: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc_lo, implicit $exec
+ ; GCN-O2-NEXT: $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc_lo, implicit $exec
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc_lo = S_CSELECT_B32 -1, 0, implicit $scc
+ $vgpr3 = V_CNDMASK_B32_e32 $vgpr4, $vgpr5, implicit $vcc, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc1
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc1
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc2
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc2
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc2
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ $sgpr0 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc3
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc3
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc3
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc4
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc4
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc4
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr3, 0, implicit $exec
+ $sgpr3 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc5
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc5
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc5
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr16 = S_MOV_B32 0
+ $sgpr32 = S_MOV_B32 0
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_addc6
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_addc6
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr32 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr48 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr80 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr96 = S_MOV_B32 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_addc6
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr32 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr48 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr80 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr96 = S_MOV_B32 0
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr16 = S_MOV_B32 0
+ $sgpr32 = S_MOV_B32 0
+ $sgpr48 = S_MOV_B32 0
+ $sgpr80 = S_MOV_B32 0
+ $sgpr96 = S_MOV_B32 0
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_vaddc1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_vaddc1
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_vaddc1
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $vgpr2, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr1, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: hazard_gap1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_gap1
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_gap1
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_gap2
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_gap2
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_gap2
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_gap3
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_gap3
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_gap3
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_gap4_no_hazard
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_gap4_no_hazard
+ ; GCN-O0: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_gap4_no_hazard
+ ; GCN-O2: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr1, $sgpr0, 0, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr4 = S_ADD_U32 $sgpr3, 0, implicit-def $scc
+ $sgpr6 = S_ADD_U32 $sgpr5, 0, implicit-def $scc
+ $sgpr8 = S_ADD_U32 $sgpr7, 0, implicit-def $scc
+ $sgpr10 = S_ADD_U32 $sgpr9, 0, implicit-def $scc
+ $sgpr12 = S_ADD_U32 $sgpr11, 0, implicit-def $scc
+ $sgpr14 = S_ADD_U32 $sgpr13, 0, implicit-def $scc
+ $sgpr16 = S_ADD_U32 $sgpr15, 0, implicit-def $scc
+ $sgpr18 = S_ADD_U32 $sgpr17, 0, implicit-def $scc
+ $sgpr20 = S_ADD_U32 $sgpr19, 0, implicit-def $scc
+ $sgpr22 = S_ADD_U32 $sgpr21, 0, implicit-def $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_valu_write1_no_hazard
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_valu_write1_no_hazard
+ ; GCN-O0: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
+ ; GCN-O0-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_valu_write1_no_hazard
+ ; GCN-O2: $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
+ ; GCN-O2-NEXT: $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ ; GCN-O2-NEXT: $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $sgpr1 = V_CMP_EQ_F32_e64 0, $vgpr0, 0, $vgpr1, 1, implicit $mode, implicit $exec
+ $sgpr1 = S_CSELECT_B32 -1, 0, implicit $scc
+ $sgpr2 = S_ADD_U32 $sgpr1, 0, implicit-def $scc
+ S_ENDPGM 0
+...
+
+---
+name: hazard_post_order1
+body: |
+ bb.0:
+ ; GCN-O0-LABEL: name: hazard_post_order1
+ ; GCN-O0: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_post_order1
+ ; GCN-O2: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: hazard_post_order2
+body: |
+ ; GCN-O0-LABEL: name: hazard_post_order2
+ ; GCN-O0: bb.0:
+ ; GCN-O0-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_BRANCH %bb.1
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.1:
+ ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_post_order2
+ ; GCN-O2: bb.0:
+ ; GCN-O2-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: S_BRANCH %bb.1
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.1:
+ ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ bb.0:
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: hazard_post_order_cycle
+body: |
+ ; GCN-O0-LABEL: name: hazard_post_order_cycle
+ ; GCN-O0: bb.0:
+ ; GCN-O0-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: S_NOP 0
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.1:
+ ; GCN-O0-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.2:
+ ; GCN-O0-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O0-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.3:
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_post_order_cycle
+ ; GCN-O2: bb.0:
+ ; GCN-O2-NEXT: successors: %bb.1(0x80000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: S_NOP 0
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.1:
+ ; GCN-O2-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: $sgpr0_sgpr1 = S_GETPC_B64
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.2:
+ ; GCN-O2-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ ; GCN-O2-NEXT: S_CBRANCH_SCC0 %bb.1, implicit $scc
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.3:
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ bb.0:
+ S_NOP 0
+
+ bb.1:
+ $sgpr0_sgpr1 = S_GETPC_B64
+ $sgpr3 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+
+ bb.2:
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0, implicit $exec
+ S_CBRANCH_SCC0 %bb.1, implicit $scc
+
+ bb.3:
+ S_ENDPGM 0
+...
+
+---
+name: hazard_calls
+frameInfo:
+ hasCalls: true
+body: |
+ ; GCN-O0-LABEL: name: hazard_calls
+ ; GCN-O0: bb.0:
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_SETPC_B64 $sgpr0_sgpr1
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.1:
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: S_SETPC_B64_return $sgpr0_sgpr1
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.2:
+ ; GCN-O0-NEXT: successors: %bb.3(0x80000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3
+ ; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.3:
+ ; GCN-O0-NEXT: successors: %bb.4(0x80000000)
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O0-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0
+ ; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: bb.4:
+ ; GCN-O0-NEXT: S_ENDPGM 0
+ ;
+ ; GCN-O2-LABEL: name: hazard_calls
+ ; GCN-O2: bb.0:
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: S_SETPC_B64 $sgpr0_sgpr1
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.1:
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: S_SETPC_B64_return $sgpr0_sgpr1
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.2:
+ ; GCN-O2-NEXT: successors: %bb.3(0x80000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.3:
+ ; GCN-O2-NEXT: successors: %bb.4(0x80000000)
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0
+ ; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: bb.4:
+ ; GCN-O2-NEXT: S_ENDPGM 0
+ bb.0:
+ S_SETPC_B64 $sgpr0_sgpr1
+
+ bb.1:
+ S_SETPC_B64_return $sgpr0_sgpr1
+
+ bb.2:
+ $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3
+ $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc
+
+ bb.3:
+ $sgpr8_sgpr9 = S_CALL_B64 0
+
+ bb.4:
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
index 3f40a57ca1491..e3b96c08348fc 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
@@ -1,11 +1,12 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s
# GCN-LABEL: name: hazard_vcmpx_permlane16
# GCN: V_CMPX_LE_F32_nosdst_e32
# GCN: S_ADD_U32
# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GFX12-NEXT: S_WAITCNT_DEPCTR
# GCN-NEXT: V_PERMLANE16_B32_e64
---
name: hazard_vcmpx_permlane16
@@ -128,6 +129,7 @@ body: |
# GCN: V_CMPX_LE_F32_nosdst_e32
# GCN: S_ADD_U32
# GCN-NEXT: dead $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+# GFX12-NEXT: S_WAITCNT_DEPCTR
# GCN-NEXT: V_PERMLANE16_B32_e64
---
name: hazard_vcmpx_permlane16_undef_src
@@ -150,6 +152,7 @@ body: |
# GCN: V_CMPX_LE_F32_nosdst_e64
# GCN: S_ADD_U32
# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GFX12-NEXT: S_WAITCNT_DEPCTR
# GCN-NEXT: V_PERMLANE16_B32_e64
---
name: hazard_vcmpx_e64_permlane16
>From 2261a7aa56fb04ebbccac6d3aa7136b750904fcb Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Tue, 23 Jul 2024 14:58:54 +0900
Subject: [PATCH 2/3] Refinements to reduce test coverage changes.
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 132 +++++++++++-------
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1 +
.../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 4 -
.../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 4 -
.../GlobalISel/clamp-fmed3-const-combine.ll | 8 --
.../GlobalISel/clamp-minmax-const-combine.ll | 17 ---
.../GlobalISel/extractelement-stack-lower.ll | 3 -
.../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 15 +-
.../GlobalISel/fmed3-min-max-const-combine.ll | 18 ---
.../llvm.amdgcn.global.atomic.csub.ll | 4 -
.../GlobalISel/llvm.amdgcn.rsq.clamp.ll | 8 --
.../AMDGPU/GlobalISel/load-constant.96.ll | 10 --
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 9 --
llvm/test/CodeGen/AMDGPU/abs_i16.ll | 8 --
.../AMDGPU/atomic_optimizations_buffer.ll | 19 +--
.../atomic_optimizations_global_pointer.ll | 42 +++---
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 19 +--
.../atomic_optimizations_struct_buffer.ll | 19 +--
llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 2 -
.../buffer-fat-pointer-atomicrmw-fadd.ll | 6 -
.../buffer-fat-pointer-atomicrmw-fmax.ll | 2 -
.../buffer-fat-pointer-atomicrmw-fmin.ll | 2 -
.../test/CodeGen/AMDGPU/code-size-estimate.ll | 36 ++---
.../fast-unaligned-load-store.global.ll | 3 -
.../fast-unaligned-load-store.private.ll | 12 --
llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 7 -
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 47 -------
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 14 --
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 14 --
llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 28 ----
llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 82 -----------
llvm/test/CodeGen/AMDGPU/fminimum3.ll | 82 -----------
.../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 14 --
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 6 -
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 30 +---
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 12 --
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 12 --
.../AMDGPU/indirect-call-known-callees.ll | 2 -
.../insert_waitcnt_for_precise_memory.ll | 13 +-
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 57 --------
.../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 6 -
.../llvm.amdgcn.buffer.load-last-use.ll | 2 -
.../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll | 28 ----
.../CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll | 20 ---
.../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 20 ---
.../AMDGPU/llvm.amdgcn.permlane.ptr.ll | 16 ---
...mdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll | 4 -
...amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll | 5 -
...m.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll | 5 -
.../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 7 -
.../CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll | 2 -
...cn.struct.ptr.buffer.atomic.fadd.v2bf16.ll | 2 -
...gcn.struct.ptr.buffer.atomic.fadd_nortn.ll | 4 -
...mdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll | 4 -
...mdgcn.struct.ptr.buffer.atomic.fmax.f32.ll | 8 --
...mdgcn.struct.ptr.buffer.atomic.fmin.f32.ll | 8 --
.../CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll | 1 -
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 21 ---
llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 20 ---
llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll | 22 ---
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 21 ---
llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 20 ---
llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll | 22 ---
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 4 -
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 3 +-
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 41 +++---
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 13 +-
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 36 ++---
.../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 10 --
.../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 10 --
.../lower-work-group-id-intrinsics-hsa.ll | 1 -
.../lower-work-group-id-intrinsics-pal.ll | 1 -
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 17 ---
llvm/test/CodeGen/AMDGPU/mul.ll | 6 -
llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 43 ------
.../CodeGen/AMDGPU/offset-split-global.ll | 43 ------
.../AMDGPU/pseudo-scalar-transcendental.ll | 59 ++++----
.../AMDGPU/select-flags-to-fmin-fmax.ll | 42 ------
.../CodeGen/AMDGPU/valu-read-sgpr-hazard.mir | 17 ++-
79 files changed, 233 insertions(+), 1204 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 9815b8c0d5d73..45c2624e43d4c 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -45,6 +45,10 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
cl::desc("Fill a percentage of the latency between "
"neighboring MFMA with s_nops."));
+static cl::opt<unsigned> MaxExhaustiveHazardSearch(
+ "amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden,
+ cl::desc("Maximum function size for exhaustive hazard search"));
+
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//
@@ -52,15 +56,11 @@ static cl::opt<unsigned, false, MFMAPaddingRatioParser>
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
const GCNSubtarget &ST);
-GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
- IsHazardRecognizerMode(false),
- CurrCycleInstr(nullptr),
- MF(MF),
- ST(MF.getSubtarget<GCNSubtarget>()),
- TII(*ST.getInstrInfo()),
- TRI(TII.getRegisterInfo()),
- ClauseUses(TRI.getNumRegUnits()),
- ClauseDefs(TRI.getNumRegUnits()) {
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
+ : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
+ ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+ TRI(TII.getRegisterInfo()), UseVALUReadHazardExhaustiveSearch(false),
+ ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
TSchedModel.init(&ST);
RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
@@ -2933,12 +2933,19 @@ void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
if (!VALUReadHazardSGPRs.empty())
return;
- // Consider all SGPRs hazards if the shader uses function calls or is callee.
auto CallingConv = MF.getFunction().getCallingConv();
- bool UseVALUUseCache = AMDGPU::isEntryFunctionCC(CallingConv) &&
- !MF.getFrameInfo().hasCalls() &&
- MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
+ bool IsCallFree =
+ AMDGPU::isEntryFunctionCC(CallingConv) && !MF.getFrameInfo().hasCalls();
+ // Exhaustive search is only viable in non-caller/callee functions where
+ // VALUs will be exposed to the hazard recognizer.
+ UseVALUReadHazardExhaustiveSearch =
+ IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None &&
+ MF.getInstructionCount() <= MaxExhaustiveHazardSearch;
+
+ // Consider all SGPRs as hazards if the shader uses function calls or is a callee.
+ bool UseVALUUseCache =
+ IsCallFree && MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
if (!UseVALUUseCache)
return;
@@ -2998,10 +3005,9 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
// 1. VALU reads SGPR
// 2. SALU writes SGPR
// 3. VALU/SALU reads SGPR
- // We do not search for (1) because the expiry point of the hazard
- // is indeterminate; however, the hazard between (2) and (3) can
- // expire if the gap contains sufficient SALU instructions with no
- // usage of SGPR from (1).
+ // Try to avoid searching for (1) because the expiry point of the hazard is
+ // indeterminate; however, the hazard between (2) and (3) can expire if the
+ // gap contains sufficient SALU instructions with no usage of SGPR from (1).
// Note: SGPRs must be considered as 64-bit pairs as hazard exists
// even if individual SGPRs are accessed.
@@ -3010,18 +3016,6 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (!(MIIsSALU || MIIsVALU))
return false;
- // Always mitigate before a call/return as the callee/caller will not
- // see the hazard chain, i.e. (2) to (3) described above.
- if (MI->getOpcode() == AMDGPU::S_SETPC_B64 ||
- MI->getOpcode() == AMDGPU::S_SETPC_B64_return ||
- MI->getOpcode() == AMDGPU::S_SWAPPC_B64 ||
- MI->getOpcode() == AMDGPU::S_CALL_B64) {
- BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
- return true;
- }
-
// Avoid expensive search when compile time is priority by
// mitigating every SALU which writes an SGPR.
if (MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {
@@ -3058,41 +3052,54 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (VALUReadHazardSGPRs.none())
return false;
+ // All SGPR writes before a call/return must be flushed as the callee/caller
+ // will not see the hazard chain, i.e. (2) to (3) described above.
+ const bool IsSetPC = (MI->getOpcode() == AMDGPU::S_SETPC_B64 ||
+ MI->getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ MI->getOpcode() == AMDGPU::S_SWAPPC_B64 ||
+ MI->getOpcode() == AMDGPU::S_CALL_B64);
+
// Collect all SGPR sources for MI which are read by a VALU.
const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallSet<Register, 4> SGPRsUsed;
- for (const MachineOperand &Op : MI->all_uses()) {
- Register OpReg = Op.getReg();
+ if (!IsSetPC) {
+ for (const MachineOperand &Op : MI->all_uses()) {
+ Register OpReg = Op.getReg();
- // Only consider VCC implicit uses on VALUs.
- // The only expected SALU implicit access is SCC which is no hazard.
- if (MIIsSALU && Op.isImplicit())
- continue;
+ // Only consider VCC implicit uses on VALUs.
+ // The only expected SALU implicit access is SCC which is no hazard.
+ if (MIIsSALU && Op.isImplicit())
+ continue;
- if (!TRI.isSGPRReg(MRI, OpReg))
- continue;
+ if (!TRI.isSGPRReg(MRI, OpReg))
+ continue;
- // Ignore special purposes registers such as NULL, EXEC, and M0.
- if (TRI.getEncodingValue(OpReg) >= SGPR_NULL)
- continue;
+ // Ignore special purposes registers such as NULL, EXEC, and M0.
+ if (TRI.getEncodingValue(OpReg) >= SGPR_NULL)
+ continue;
- unsigned RegN = baseSGPRNumber(OpReg, TRI);
- if (!VALUReadHazardSGPRs[RegN])
- continue;
+ unsigned RegN = baseSGPRNumber(OpReg, TRI);
+ if (!VALUReadHazardSGPRs[RegN])
+ continue;
- SGPRsUsed.insert(OpReg);
- }
+ SGPRsUsed.insert(OpReg);
+ }
- // No SGPRs -> nothing to do.
- if (SGPRsUsed.empty())
- return false;
+ // No SGPRs -> nothing to do.
+ if (SGPRsUsed.empty())
+ return false;
+ }
// A hazard is any SALU which writes one of the SGPRs read by MI.
- auto IsHazardFn = [this, &SGPRsUsed](const MachineInstr &I) {
+ auto IsHazardFn = [this, IsSetPC, &SGPRsUsed](const MachineInstr &I) {
if (!SIInstrInfo::isSALU(I))
return false;
+ // Ensure SGPR flush before call/return by conservatively assuming every
+ // SALU writes an SGPR.
+ if (IsSetPC && I.getNumDefs() > 0)
+ return true;
// Check for any register writes.
return llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
return I.modifiesRegister(Reg, &TRI);
@@ -3131,6 +3138,33 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (WaitStates >= SALUExpiryCount)
return false;
+ // Validate hazard through an exhaustive search.
+ if (UseVALUReadHazardExhaustiveSearch) {
+ // A hazard is any VALU which reads one of the paired SGPRs read by MI.
+ // This is searching for (1) in the hazard description.
+ auto hazardPair = [this](Register Reg) {
+ if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
+ return Register(AMDGPU::VCC);
+ // TODO: handle TTMP?
+ return Register(AMDGPU::SGPR0_SGPR1 + baseSGPRNumber(Reg, TRI));
+ };
+ auto SearchHazardFn = [this, hazardPair,
+ &SGPRsUsed](const MachineInstr &I) {
+ if (!SIInstrInfo::isVALU(I))
+ return false;
+ // Check for any register reads.
+ return llvm::any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
+ return I.readsRegister(hazardPair(Reg), &TRI);
+ });
+ };
+ auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {
+ return false;
+ };
+ if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+ }
+
// Add s_wait_alu sa_sdst(0) before SALU read.
auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 31d5b3d517193..93b4b3771434b 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -49,6 +49,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
TargetSchedModel TSchedModel;
bool RunLdsBranchVmemWARHazardFixup;
BitVector VALUReadHazardSGPRs;
+ bool UseVALUReadHazardExhaustiveSearch;
/// RegUnits of uses in the current soft memory clause.
BitVector ClauseUses;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 40fc7139c2646..95089d4ddbb18 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -23,7 +23,6 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32:
@@ -97,7 +96,6 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32:
@@ -171,7 +169,6 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f64:
@@ -249,7 +246,6 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index ddb3b7d11185f..961e6ba59c512 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -23,7 +23,6 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32:
@@ -97,7 +96,6 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32:
@@ -171,7 +169,6 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f64:
@@ -249,7 +246,6 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
index 889a33432be06..c7676e9da6f49 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
@@ -17,7 +17,6 @@ define float @test_fmed3_f32_known_nnan_ieee_true(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call nnan float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
@@ -39,7 +38,6 @@ define half @test_fmed3_f16_known_nnan_ieee_false(half %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul half %a, 2.0
%fmed = call nnan half @llvm.amdgcn.fmed3.f16(half %fmul, half 0.0, half 1.0)
@@ -65,7 +63,6 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 {
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f32_e64 v0, 0x41200000, v0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmin = call float @llvm.minnum.f32(float %a, float 10.0)
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmin, float 0.0, float 1.0)
@@ -88,7 +85,6 @@ define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0)
@@ -112,7 +108,6 @@ define float @test_fmed3_global_nnan(float %a) #3 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
@@ -140,7 +135,6 @@ define float @test_fmed3_f32_maybe_NaN_ieee_false(float %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0)
@@ -167,7 +161,6 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false(float %a) #4 {
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f32_e64 v0, 0x41200000, v0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmin = call float @llvm.minnum.f32(float %a, float 10.0)
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmin, float 0.0, float 1.0)
@@ -190,7 +183,6 @@ define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index 55e38e3a32162..ca0047bba6c4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -17,7 +17,6 @@ define float @test_min_max_ValK0_K1_f32(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%maxnum = call nnan float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -40,7 +39,6 @@ define double @test_min_max_K0Val_K1_f64(double %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f64_e64 v[0:1], v[0:1], 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul double %a, 2.0
%maxnum = call nnan double @llvm.maxnum.f64(double 0.0, double %fmul)
@@ -64,7 +62,6 @@ define half @test_min_K1max_ValK0_f16(half %a) #2 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul half %a, 2.0
%maxnum = call half @llvm.maxnum.f16(half %fmul, half 0.0)
@@ -87,7 +84,6 @@ define <2 x half> @test_min_K1max_K0Val_f16(<2 x half> %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
%maxnum = call nnan <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 0.0, half 0.0>, <2 x half> %fmul)
@@ -110,7 +106,6 @@ define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
%maxnum = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 0.0, half undef>, <2 x half> %fmul)
@@ -135,7 +130,6 @@ define float @test_max_min_ValK1_K0_f32(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%minnum = call nnan float @llvm.minnum.f32(float %fmul, float 1.0)
@@ -158,7 +152,6 @@ define double @test_max_min_K1Val_K0_f64(double %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f64_e64 v[0:1], v[0:1], 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul double %a, 2.0
%minnum = call nnan double @llvm.minnum.f64(double 1.0, double %fmul)
@@ -181,7 +174,6 @@ define half @test_max_K0min_ValK1_f16(half %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f16_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul half %a, 2.0
%minnum = call nnan half @llvm.minnum.f16(half %fmul, half 1.0)
@@ -205,7 +197,6 @@ define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
%minnum = call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> <half 1.0, half undef>, <2 x half> %fmul)
@@ -230,7 +221,6 @@ define float @test_min_max_global_nnan(float %a) #3 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call float @llvm.maxnum.f32(float %a, float 0.0)
%fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0)
@@ -252,7 +242,6 @@ define float @test_max_min_global_nnan(float %a) #3 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 1.0)
%fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0)
@@ -280,7 +269,6 @@ define float @test_min_max_K0_gt_K1(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maxmin_num_f32 v0, v0, 1.0, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan float @llvm.maxnum.f32(float %a, float 1.0)
%fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 0.0)
@@ -304,7 +292,6 @@ define float @test_max_min_K0_gt_K1(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0, 1.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan float @llvm.minnum.f32(float %a, float 0.0)
%fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 1.0)
@@ -331,7 +318,6 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -356,7 +342,6 @@ define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) #
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_f32_e64 v0, v0, 2.0 clamp
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -385,7 +370,6 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 1.0, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%minnum = call float @llvm.minnum.f32(float %fmul, float 1.0)
@@ -412,7 +396,6 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX12-NEXT: v_mul_f32_e32 v0, 2.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 1.0, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fmul = fmul float %a, 2.0
%minnum = call float @llvm.minnum.f32(float %fmul, float 1.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 21f8e188ff3d2..43f3dcc86f426 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -31,7 +31,6 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
%elt = extractelement <64 x i32> %vec, i32 %idx
@@ -64,7 +63,6 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: global_load_u16 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%vec = load <128 x i16>, ptr addrspace(1) %ptr
%elt = extractelement <128 x i16> %vec, i32 %idx
@@ -97,7 +95,6 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%vec = load <32 x i64>, ptr addrspace(1) %ptr
%elt = extractelement <32 x i64> %vec, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 9c166cc0e6222..defa4a38ab7c8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -83,8 +83,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
; GFX12-NEXT: s_and_b32 s0, s0, 15
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s0
@@ -266,7 +266,6 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [32 x float], align 4, addrspace(5)
@@ -318,7 +317,6 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -412,8 +410,8 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
; GFX12-NEXT: s_and_b32 s0, s0, 15
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s0
@@ -626,7 +624,6 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
@@ -730,8 +727,8 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
; GFX12-NEXT: s_and_b32 s0, s0, 15
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s0
@@ -948,7 +945,6 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -1120,7 +1116,6 @@ define void @store_load_large_imm_offset_foo() {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [4096 x i32], align 4, addrspace(5)
@@ -1275,7 +1270,6 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, ptr addrspace(5) %arg, align 8
@@ -1341,7 +1335,6 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, ptr addrspace(5) %arg, align 1
@@ -1427,7 +1420,6 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
@@ -1519,7 +1511,6 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
index 183a657df83c0..75c4cd53e3bfc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
@@ -24,7 +24,6 @@ define float @test_min_max_ValK0_K1_f32(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
%fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -52,7 +51,6 @@ define float @test_min_max_K0Val_K1_f32(float %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan float @llvm.maxnum.f32(float 2.0, float %a)
%fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -87,7 +85,6 @@ define half @test_min_K1max_ValK0_f16(half %a) #0 {
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call half @llvm.maxnum.f16(half %a, half 2.0)
%fmed = call half @llvm.minnum.f16(half 4.0, half %maxnum)
@@ -116,7 +113,6 @@ define half @test_min_K1max_K0Val_f16(half %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan half @llvm.maxnum.f16(half 2.0, half %a)
%fmed = call nnan half @llvm.minnum.f16(half 4.0, half %maxnum)
@@ -145,7 +141,6 @@ define float @test_max_min_ValK1_K0_f32(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan float @llvm.minnum.f32(float %a, float 4.0)
%fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -173,7 +168,6 @@ define float @test_max_min_K1Val_K0_f32(float %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan float @llvm.minnum.f32(float 4.0, float %a)
%fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -202,7 +196,6 @@ define half @test_max_K0min_ValK1_f16(half %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan half @llvm.minnum.f16(half %a, half 4.0)
%fmed = call nnan half @llvm.maxnum.f16(half 2.0, half %minnum)
@@ -231,7 +224,6 @@ define half @test_max_K0min_K1Val_f16(half %a) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f16 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan half @llvm.minnum.f16(half 4.0, half %a)
%fmed = call nnan half @llvm.maxnum.f16(half 2.0, half %minnum)
@@ -261,7 +253,6 @@ define float @test_min_max_global_nnan(float %a) #2 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
%fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -289,7 +280,6 @@ define float @test_max_min_global_nnan(float %a) #2 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 4.0)
%fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -324,7 +314,6 @@ define float @test_min_max_K0_gt_K1(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maxmin_num_f32 v0, v0, 4.0, 2.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan float @llvm.maxnum.f32(float %a, float 4.0)
%fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 2.0)
@@ -355,7 +344,6 @@ define float @test_max_min_K0_gt_K1(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call nnan float @llvm.minnum.f32(float %a, float 2.0)
%fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 4.0)
@@ -386,7 +374,6 @@ define float @test_min_max_non_inline_const(float %a) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maxmin_num_f32 v0, v0, 2.0, 0x41000000
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
%fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 8.0)
@@ -420,7 +407,6 @@ define double @test_min_max_f64(double %a) #0 {
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 2.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], 4.0, v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan double @llvm.maxnum.f64(double %a, double 2.0)
%fmed = call nnan double @llvm.minnum.f64(double %maxnum, double 4.0)
@@ -457,7 +443,6 @@ define <2 x half> @test_min_max_v2f16(<2 x half> %a) #0 {
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v0, v0, 4.0 op_sel_hi:[1,0]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call nnan <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
%fmed = call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> %maxnum, <2 x half> <half 4.0, half 4.0>)
@@ -492,7 +477,6 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_med3_num_f32 v0, v0, 2.0, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
%fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -526,7 +510,6 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 4.0)
%fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -561,7 +544,6 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 4.0, 2.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%minnum = call float @llvm.minnum.f32(float %a, float 4.0)
%fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index 623616740f9be..59818b0b1bc39 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -28,7 +28,6 @@ define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
ret i32 %ret
@@ -62,7 +61,6 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
@@ -93,7 +91,6 @@ define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
ret void
@@ -127,7 +124,6 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index cbc143b738950..aa337e100dfe6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -29,7 +29,6 @@ define float @v_rsq_clamp_f32(float %src) #0 {
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
ret float %rsq_clamp
@@ -61,7 +60,6 @@ define float @v_rsq_clamp_fabs_f32(float %src) #0 {
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call float @llvm.fabs.f32(float %src)
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src)
@@ -102,7 +100,6 @@ define double @v_rsq_clamp_f64(double %src) #0 {
; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
ret double %rsq_clamp
@@ -142,7 +139,6 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call double @llvm.fabs.f64(double %src)
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
@@ -175,7 +171,6 @@ define float @v_rsq_clamp_undef_f32() #0 {
; GFX12-NEXT: v_mov_b32_e32 v0, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, s0, 0x7f7fffff, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
ret float %rsq_clamp
@@ -215,7 +210,6 @@ define double @v_rsq_clamp_undef_f64() #0 {
; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
ret double %rsq_clamp
@@ -247,7 +241,6 @@ define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
ret float %rsq_clamp
@@ -287,7 +280,6 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
; GFX12-NEXT: s_mov_b32 s1, 0xffefffff
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], s[0:1], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
ret double %rsq_clamp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index a0853f9b9808f..6bb104311a4d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -18,7 +18,6 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
-; GFX12-UNALIGNED-NEXT: s_wait_alu 0xfffe
; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -63,7 +62,6 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4
; GFX12-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
-; GFX12-NOUNALIGNED-NEXT: s_wait_alu 0xfffe
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -234,7 +232,6 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
-; GFX12-UNALIGNED-NEXT: s_wait_alu 0xfffe
; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
@@ -257,7 +254,6 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
-; GFX12-NOUNALIGNED-NEXT: s_wait_alu 0xfffe
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
@@ -359,7 +355,6 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align4:
@@ -406,7 +401,6 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_i96_align8:
@@ -453,7 +447,6 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align8:
@@ -500,7 +493,6 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v6i16_align8:
@@ -568,7 +560,6 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
; GFX12-NEXT: v_mov_b32_e32 v8, v2
; GFX12-NEXT: v_mov_b32_e32 v2, v12
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v12i8_align8:
@@ -650,7 +641,6 @@ define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index ca9685d9a0f8f..b0f3eee3c7363 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -79,7 +79,6 @@ define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
@@ -165,7 +164,6 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
@@ -255,7 +253,6 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
@@ -301,7 +298,6 @@ define i32 @v_mul_i32(i32 %num, i32 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i32 %num, %den
ret i32 %result
@@ -353,7 +349,6 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
@@ -514,7 +509,6 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v0, v3, v[4:5]
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i64 %num, %den
ret i64 %result
@@ -697,7 +691,6 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
@@ -1046,7 +1039,6 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i128 %num, %den
ret i128 %result
@@ -2477,7 +2469,6 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
index 3124d5e7ed7d8..daed0986fa9c8 100644
--- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
@@ -63,7 +63,6 @@ define i16 @abs_i16(i16 %arg) {
; GFX12-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_i16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
@@ -139,7 +138,6 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) {
; GFX12-NEXT: v_pk_sub_i16 v1, 0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
@@ -233,7 +231,6 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_max_i16 v0, v0, v2
; GFX12-NEXT: v_pk_max_i16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
@@ -342,7 +339,6 @@ define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_max_i16 v0, v0, v2
; GFX12-NEXT: v_pk_max_i16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %arg, i1 false)
ret <4 x i16> %res
@@ -482,7 +478,6 @@ define <6 x i16> @v_abs_v6i16(<6 x i16> %arg) {
; GFX12-NEXT: v_pk_max_i16 v1, v1, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_pk_max_i16 v2, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <6 x i16> @llvm.abs.v6i16(<6 x i16> %arg, i1 false)
ret <6 x i16> %res
@@ -655,7 +650,6 @@ define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_pk_max_i16 v2, v2, v6
; GFX12-NEXT: v_pk_max_i16 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %arg, i1 false)
ret <8 x i16> %res
@@ -949,7 +943,6 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) {
; GFX12-NEXT: v_pk_max_i16 v3, v3, v8
; GFX12-NEXT: v_pk_max_i16 v4, v4, v9
; GFX12-NEXT: v_pk_max_i16 v5, v5, v10
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
ret <16 x i16> %res
@@ -1494,7 +1487,6 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) {
; GFX12-NEXT: v_pk_max_i16 v13, v13, v18
; GFX12-NEXT: v_pk_max_i16 v14, v14, v19
; GFX12-NEXT: v_pk_max_i16 v15, v15, v20
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %arg, i1 false)
ret <32 x i16> %res
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 36ff06b2af57a..47548593691b5 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -224,9 +224,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -258,7 +258,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -266,6 +265,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB0_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
@@ -274,6 +274,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -510,9 +511,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -545,7 +546,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -553,6 +553,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB1_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_wait_alu 0xfffe
@@ -566,6 +567,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
@@ -1734,9 +1736,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -1769,7 +1771,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1777,6 +1778,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB5_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
@@ -1785,6 +1787,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -2025,9 +2028,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -2061,7 +2064,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2069,6 +2071,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB6_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 9398d686c4475..f5609ea33a89a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -264,9 +264,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -304,13 +304,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232-NEXT: s_mov_b32 s5, exec_lo
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB0_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_wait_alu 0xfffe
@@ -326,6 +326,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB0_2:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
@@ -601,9 +602,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b64 s[0:1], exec
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -641,13 +642,13 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: s_mov_b32 s1, exec_lo
-; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB1_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
@@ -1607,9 +1608,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
@@ -1918,9 +1918,9 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1960,7 +1960,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
; GFX1232-NEXT: s_mov_b32 s5, 0
-; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
; GFX1232-NEXT: s_mov_b32 s6, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1968,6 +1967,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB3_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_wait_alu 0xfffe
@@ -2311,9 +2311,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b32 s11, 0
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2356,7 +2356,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX1232-NEXT: s_mov_b32 s9, exec_lo
; GFX1232-NEXT: s_mov_b32 s3, 0
-; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2364,6 +2363,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB4_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9
; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
@@ -3727,7 +3727,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4025,9 +4024,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -4066,13 +4065,13 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232-NEXT: s_mov_b32 s5, exec_lo
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB6_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_wait_alu 0xfffe
@@ -4088,6 +4087,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB6_2:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
@@ -4368,9 +4368,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b64 s[0:1], exec
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -4409,13 +4409,13 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: s_mov_b32 s1, exec_lo
-; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB7_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
@@ -5376,9 +5376,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
@@ -5701,9 +5700,9 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1264-NEXT: s_mov_b64 s[6:7], exec
; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5746,7 +5745,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232-NEXT: s_mov_b32 s4, exec_lo
; GFX1232-NEXT: s_mov_b32 s5, 0
-; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
; GFX1232-NEXT: s_mov_b32 s6, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5754,6 +5752,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB9_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_wait_alu 0xfffe
@@ -6113,9 +6112,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b32 s11, 0
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -6162,7 +6161,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX1232-NEXT: s_mov_b32 s9, exec_lo
; GFX1232-NEXT: s_mov_b32 s3, 0
-; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -6170,6 +6168,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB10_2
; GFX1232-NEXT: ; %bb.1:
+; GFX1232-NEXT: s_wait_alu 0xfffe
; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s9
; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
@@ -7537,7 +7536,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 96309ccddb4ea..823a65e99a139 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -223,9 +223,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -257,7 +257,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -265,6 +264,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB0_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
@@ -273,6 +273,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -509,9 +510,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -544,7 +545,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -552,6 +552,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB1_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_wait_alu 0xfffe
@@ -565,6 +566,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
@@ -1312,9 +1314,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -1347,7 +1349,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1355,6 +1356,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB4_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
@@ -1363,6 +1365,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB4_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1603,9 +1606,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -1639,7 +1642,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1647,6 +1649,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB5_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index f6b9f8ba058dd..a4cb8314b30a1 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -230,9 +230,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -265,7 +265,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -273,6 +272,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB0_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
@@ -281,6 +281,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -524,9 +525,9 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -560,7 +561,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -568,6 +568,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB1_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_wait_alu 0xfffe
@@ -581,6 +582,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
@@ -1476,9 +1478,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64: ; %bb.0: ; %entry
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -1512,7 +1514,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32: ; %bb.0: ; %entry
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
; GFX12W32-NEXT: s_mov_b32 s0, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1520,6 +1521,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_cbranch_execz .LBB5_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_mul_i32 s1, s1, 5
@@ -1528,6 +1530,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12W32-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
@@ -1775,9 +1778,9 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: s_load_b32 s6, s[2:3], 0x44
; GFX12W64-NEXT: s_mov_b64 s[4:5], exec
; GFX12W64-NEXT: s_mov_b64 s[0:1], exec
-; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W64-NEXT: ; implicit-def: $vgpr1
+; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX12W64-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -1812,7 +1815,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_load_b32 s0, s[2:3], 0x44
; GFX12W32-NEXT: s_mov_b32 s4, exec_lo
; GFX12W32-NEXT: s_mov_b32 s1, exec_lo
-; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX12W32-NEXT: ; implicit-def: $vgpr1
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1820,6 +1822,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_cbranch_execz .LBB6_2
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[2:3], 0x34
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 0fe447597d4ba..2135b094c11de 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -221,7 +221,6 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT: global_inv scope:SCOPE_SE
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
ret float %res
@@ -357,7 +356,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE
; GFX1200-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1200-NEXT: global_inv scope:SCOPE_SE
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 6b5d1eb4cd41c..5d3df857096d0 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -27,7 +27,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
@@ -241,7 +240,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
@@ -4875,7 +4873,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
@@ -5148,7 +5145,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
@@ -5907,7 +5903,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
@@ -6316,7 +6311,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index f299b5474f100..fc764b3776df0 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -27,7 +27,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
@@ -228,7 +227,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 8418bcbbd760f..71493419c1be4 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -27,7 +27,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset:
@@ -228,7 +227,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset:
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index 0d7d6e8331418..ac03d2dae8fa8 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -34,13 +34,12 @@ define float @v_mul_f32_vop2(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x10]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%mul = fmul float %x, %y
ret float %mul
}
; NOT-GFX12: codeLenInByte = 12
-; GFX1200: codeLenInByte = 32
+; GFX1200: codeLenInByte = 28
define float @v_mul_f32_vop2_inline_imm(float %x) {
; GFX9-LABEL: v_mul_f32_vop2_inline_imm:
@@ -69,13 +68,12 @@ define float @v_mul_f32_vop2_inline_imm(float %x) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e32 v0, 4.0, v0 ; encoding: [0xf6,0x00,0x00,0x10]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%mul = fmul float %x, 4.0
ret float %mul
}
; NOT-GFX12: codeLenInByte = 12
-; GFX1200: codeLenInByte = 32
+; GFX1200: codeLenInByte = 28
define float @v_mul_f32_vop2_literal(float %x) {
; GFX9-LABEL: v_mul_f32_vop2_literal:
@@ -104,13 +102,12 @@ define float @v_mul_f32_vop2_literal(float %x) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e32 v0, 0x42f60000, v0 ; encoding: [0xff,0x00,0x00,0x10,0x00,0x00,0xf6,0x42]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%mul = fmul float %x, 123.0
ret float %mul
}
; NOT-GFX12: codeLenInByte = 16
-; GFX1200: codeLenInByte = 36
+; GFX1200: codeLenInByte = 32
define float @v_mul_f32_vop3_src_mods(float %x, float %y) {
; GFX9-LABEL: v_mul_f32_vop3_src_mods:
@@ -139,14 +136,13 @@ define float @v_mul_f32_vop3_src_mods(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0x03,0x02,0x00]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%mul = fmul float %fabs.x, %y
ret float %mul
}
; NOT-GFX12: codeLenInByte = 16
-; GFX1200: codeLenInByte = 36
+; GFX1200: codeLenInByte = 32
define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
; GFX9-LABEL: v_mul_f32_vop3_src_mods_inline_imm:
@@ -175,7 +171,6 @@ define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e64 v0, |v0|, 4.0 ; encoding: [0x00,0x01,0x08,0xd5,0x00,0xed,0x01,0x00]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%mul = fmul float %fabs.x, 4.0
@@ -183,7 +178,7 @@ define float @v_mul_f32_vop3_src_mods_inline_imm(float %x, float %y) {
}
; NOT-GFX12: codeLenInByte = 16
-; GFX1200: codeLenInByte = 36
+; GFX1200: codeLenInByte = 32
define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
; GFX9-LABEL: v_mul_f32_vop3_src_mods_literal:
@@ -213,7 +208,6 @@ define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e64 v0, 0x42f60000, |v0| ; encoding: [0x00,0x02,0x08,0xd5,0xff,0x00,0x02,0x00,0x00,0x00,0xf6,0x42]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%mul = fmul float %fabs.x, 123.0
@@ -223,7 +217,7 @@ define float @v_mul_f32_vop3_src_mods_literal(float %x, float %y) {
; GFX9: codeLenInByte = 24
; GFX10: codeLenInByte = 20
; GFX11: codeLenInByte = 20
-; GFX1200: codeLenInByte = 40
+; GFX1200: codeLenInByte = 36
define float @v_mul_f32_vop2_frame_index(float %x) {
; GFX9-LABEL: v_mul_f32_vop2_frame_index:
@@ -254,7 +248,6 @@ define float @v_mul_f32_vop2_frame_index(float %x) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_mul_f32_e32 v0, s32, v0 ; encoding: [0x20,0x00,0x00,0x10]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%alloca = alloca i32, addrspace(5)
%ptrtoint = ptrtoint ptr addrspace(5) %alloca to i32
@@ -266,7 +259,7 @@ define float @v_mul_f32_vop2_frame_index(float %x) {
; GFX9: codeLenInByte = 20
; GFX10: codeLenInByte = 20
; GFX11: codeLenInByte = 12
-; GFX1200: codeLenInByte = 32
+; GFX1200: codeLenInByte = 28
define float @v_fma_f32(float %x, float %y, float %z) {
; GFX9-LABEL: v_fma_f32:
@@ -295,14 +288,13 @@ define float @v_fma_f32(float %x, float %y, float %z) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_fma_f32 v0, v0, v1, v2 ; encoding: [0x00,0x00,0x13,0xd6,0x00,0x03,0x0a,0x04]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fma = call float @llvm.fma.f32(float %x, float %y, float %z)
ret float %fma
}
; NOT-GFX12: codeLenInByte = 16
-; GFX1200: codeLenInByte = 36
+; GFX1200: codeLenInByte = 32
define float @v_fma_f32_src_mods(float %x, float %y, float %z) {
; GFX9-LABEL: v_fma_f32_src_mods:
@@ -331,7 +323,6 @@ define float @v_fma_f32_src_mods(float %x, float %y, float %z) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_fma_f32 v0, |v0|, v1, v2 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0x0a,0x04]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z)
@@ -339,7 +330,7 @@ define float @v_fma_f32_src_mods(float %x, float %y, float %z) {
}
; NOT-GFX12: codeLenInByte = 16
-; GFX1200: codeLenInByte = 36
+; GFX1200: codeLenInByte = 32
define float @v_fmac_f32(float %x, float %y) {
; GFX9-LABEL: v_fmac_f32:
@@ -368,7 +359,6 @@ define float @v_fmac_f32(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_fmac_f32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x56]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fma = call float @llvm.fma.f32(float %x, float %y, float %x)
ret float %fma
@@ -377,7 +367,7 @@ define float @v_fmac_f32(float %x, float %y) {
; GFX9: codeLenInByte = 16
; GFX10: codeLenInByte = 12
; GFX11: codeLenInByte = 12
-; GFX1200: codeLenInByte = 32
+; GFX1200: codeLenInByte = 28
define float @v_fmaak_f32(float %x, float %y) {
; GFX9-LABEL: v_fmaak_f32:
@@ -407,7 +397,6 @@ define float @v_fmaak_f32(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_fmaak_f32 v0, v0, v1, 0x43800000 ; encoding: [0x00,0x03,0x00,0x5a,0x00,0x00,0x80,0x43]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fma = call float @llvm.fma.f32(float %x, float %y, float 256.0)
ret float %fma
@@ -416,7 +405,7 @@ define float @v_fmaak_f32(float %x, float %y) {
; GFX9: codeLenInByte = 24
; GFX10: codeLenInByte = 16
; GFX11: codeLenInByte = 16
-; GFX1200: codeLenInByte = 36
+; GFX1200: codeLenInByte = 32
define float @v_fma_k_f32_src_mods(float %x, float %y) {
; GFX9-LABEL: v_fma_k_f32_src_mods:
@@ -446,7 +435,6 @@ define float @v_fma_k_f32_src_mods(float %x, float %y) {
; GFX1200-NEXT: s_wait_bvhcnt 0x0 ; encoding: [0x00,0x00,0xc3,0xbf]
; GFX1200-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf]
; GFX1200-NEXT: v_fma_f32 v0, |v0|, v1, 0x43800000 ; encoding: [0x00,0x01,0x13,0xd6,0x00,0x03,0xfe,0x03,0x00,0x00,0x80,0x43]
-; GFX1200-NEXT: s_wait_alu 0xfffe ; encoding: [0xfe,0xff,0x88,0xbf]
; GFX1200-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fma = call float @llvm.fma.f32(float %fabs.x, float %y, float 256.0)
@@ -456,7 +444,7 @@ define float @v_fma_k_f32_src_mods(float %x, float %y) {
; GFX9: codeLenInByte = 24
; GFX10: codeLenInByte = 20
; GFX11: codeLenInByte = 20
-; GFX1200: codeLenInByte = 40
+; GFX1200: codeLenInByte = 36
define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
; GFX9-LABEL: s_fmaak_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 90cc7f2c38599..7252c69cb1cf7 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -58,7 +58,6 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
%p.0 = load i16, ptr addrspace(1) %p, align 2
@@ -205,7 +204,6 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
%p.0 = load i16, ptr addrspace(1) %p, align 1
@@ -345,7 +343,6 @@ define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
%p.0 = load i16, ptr addrspace(1) %p, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index fc4a9892ca2dc..f9694dcd89abf 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -82,7 +82,6 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v0, off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_load_2xi16_align2:
@@ -94,7 +93,6 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off
; GFX12-FLASTSCR-NEXT: s_wait_loadcnt 0x0
-; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
%p.0 = load i16, ptr addrspace(5) %p, align 2
@@ -180,7 +178,6 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-NEXT: scratch_store_b32 v1, v0, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_store_2xi16_align2:
@@ -192,7 +189,6 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off
-; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
store i16 1, ptr addrspace(5) %r, align 2
@@ -282,7 +278,6 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v0, off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_load_2xi16_align1:
@@ -294,7 +289,6 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off
; GFX12-FLASTSCR-NEXT: s_wait_loadcnt 0x0
-; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
%p.0 = load i16, ptr addrspace(5) %p, align 1
@@ -385,7 +379,6 @@ define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-NEXT: scratch_store_b32 v1, v0, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_store_2xi16_align1:
@@ -397,7 +390,6 @@ define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off
-; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
store i16 1, ptr addrspace(5) %r, align 1
@@ -472,7 +464,6 @@ define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v0, off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_load_2xi16_align4:
@@ -484,7 +475,6 @@ define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 {
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off
; GFX12-FLASTSCR-NEXT: s_wait_loadcnt 0x0
-; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
%p.0 = load i16, ptr addrspace(5) %p, align 4
@@ -567,7 +557,6 @@ define void @private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-NEXT: scratch_store_b32 v1, v0, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FLASTSCR-LABEL: private_store_2xi16_align4:
@@ -579,7 +568,6 @@ define void @private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r
; GFX12-FLASTSCR-NEXT: s_wait_kmcnt 0x0
; GFX12-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX12-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off
-; GFX12-FLASTSCR-NEXT: s_wait_alu 0xfffe
; GFX12-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
store i16 1, ptr addrspace(5) %r, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index a260caafa3afe..f0ce96af90649 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -3030,7 +3030,6 @@ define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %arg)
ret <2 x float> %canon
@@ -3070,7 +3069,6 @@ define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> %arg)
ret <3 x float> %canon
@@ -3112,7 +3110,6 @@ define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %arg)
ret <4 x float> %canon
@@ -3166,7 +3163,6 @@ define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
; GFX12-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
; GFX12-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %arg)
ret <8 x float> %canon
@@ -3203,7 +3199,6 @@ define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %arg)
ret <2 x double> %canon
@@ -3244,7 +3239,6 @@ define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 {
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> %arg)
ret <3 x double> %canon
@@ -3289,7 +3283,6 @@ define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5]
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%canon = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %arg)
ret <4 x double> %canon
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index b2494e394f7ec..608c05275d00d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -25,7 +25,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -203,7 +202,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -391,7 +389,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -587,7 +584,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -795,7 +791,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -1014,7 +1009,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -1791,7 +1785,6 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
@@ -2010,7 +2003,6 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
@@ -2229,7 +2221,6 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
@@ -2452,7 +2443,6 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -2630,7 +2620,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2818,7 +2807,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
@@ -3014,7 +3002,6 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -3222,7 +3209,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -3441,7 +3427,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
@@ -4409,7 +4394,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
@@ -4587,7 +4571,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
@@ -4795,7 +4778,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
@@ -4973,7 +4955,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
@@ -5181,7 +5162,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
@@ -5359,7 +5339,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
@@ -5567,7 +5546,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
@@ -5745,7 +5723,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
@@ -13196,7 +13173,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
@@ -13385,7 +13361,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13577,7 +13552,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -13783,7 +13757,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -13964,7 +13937,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14152,7 +14124,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -14355,7 +14326,6 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14549,7 +14519,6 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14739,7 +14708,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
@@ -14928,7 +14896,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
@@ -15109,7 +15076,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -15298,7 +15264,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -15483,7 +15448,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -15760,7 +15724,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16040,7 +16003,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -16334,7 +16296,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
@@ -16603,7 +16564,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -16879,7 +16839,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -17170,7 +17129,6 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -17452,7 +17410,6 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -17730,7 +17687,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
@@ -18007,7 +17963,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
@@ -18276,7 +18231,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -18553,7 +18507,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 7d4a8b6480e30..89c763159fe0c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -25,7 +25,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -169,7 +168,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -319,7 +317,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -484,7 +481,6 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -626,7 +622,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -775,7 +770,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -1373,7 +1367,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory:
@@ -1517,7 +1510,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -1665,7 +1657,6 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -1809,7 +1800,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -1959,7 +1949,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
@@ -2124,7 +2113,6 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -2266,7 +2254,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2415,7 +2402,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 165c2c8f4165f..e337fffcd09c5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -25,7 +25,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -169,7 +168,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -319,7 +317,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -484,7 +481,6 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -626,7 +622,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -775,7 +770,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -1373,7 +1367,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory:
@@ -1517,7 +1510,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
@@ -1665,7 +1657,6 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -1809,7 +1800,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -1959,7 +1949,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
@@ -2124,7 +2113,6 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory:
@@ -2266,7 +2254,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
@@ -2415,7 +2402,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index baee2c7c839a4..dc452823416f8 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -277,7 +277,6 @@ define void @zero_init_foo() {
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_foo:
@@ -368,7 +367,6 @@ define void @zero_init_foo() {
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca [32 x i16], align 2, addrspace(5)
call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
@@ -907,7 +905,6 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
@@ -980,7 +977,6 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [32 x float], align 4, addrspace(5)
@@ -1024,7 +1020,6 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:4
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
@@ -1066,7 +1061,6 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -1371,7 +1365,6 @@ define void @zero_init_small_offset_foo() {
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
@@ -1472,7 +1465,6 @@ define void @zero_init_small_offset_foo() {
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [64 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
@@ -2170,7 +2162,6 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
@@ -2256,7 +2247,6 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
@@ -2588,7 +2578,6 @@ define void @zero_init_large_offset_foo() {
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16400
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16416
; GFX12-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16432
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_large_offset_foo:
@@ -2730,7 +2719,6 @@ define void @zero_init_large_offset_foo() {
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16400
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16416
; GFX12-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16432
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [4096 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
@@ -3432,7 +3420,6 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
@@ -3520,7 +3507,6 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -3770,7 +3756,6 @@ define void @store_load_large_imm_offset_foo() {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
@@ -3847,7 +3832,6 @@ define void @store_load_large_imm_offset_foo() {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [4096 x i32], align 4, addrspace(5)
@@ -4061,7 +4045,6 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_aligned:
@@ -4120,7 +4103,6 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, ptr addrspace(5) %arg, align 8
@@ -4174,7 +4156,6 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
@@ -4233,7 +4214,6 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, ptr addrspace(5) %arg, align 1
@@ -4291,7 +4271,6 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
@@ -4355,7 +4334,6 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
@@ -4415,7 +4393,6 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
@@ -4482,7 +4459,6 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
@@ -4535,7 +4511,6 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
@@ -4604,7 +4579,6 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1
@@ -4659,7 +4633,6 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
@@ -4729,7 +4702,6 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
-; GFX12-PAL-NEXT: s_wait_alu 0xfffe
; GFX12-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 745180f242afa..27282a453075b 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -11,7 +11,6 @@ define float @v_fmaximum3_f32(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32:
@@ -39,7 +38,6 @@ define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v2, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_commute:
@@ -95,7 +93,6 @@ define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, |v0|, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fabs0:
@@ -124,7 +121,6 @@ define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, |v1|, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fabs1:
@@ -153,7 +149,6 @@ define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, |v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fabs2:
@@ -182,7 +177,6 @@ define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v1|, |v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fabs_all:
@@ -213,7 +207,6 @@ define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v1, -v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fneg_all:
@@ -244,7 +237,6 @@ define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fneg_fabs_all:
@@ -278,7 +270,6 @@ define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, -v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fneg0:
@@ -307,7 +298,6 @@ define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, -v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fneg1:
@@ -336,7 +326,6 @@ define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, -v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_fneg2:
@@ -365,7 +354,6 @@ define float @v_fmaximum3_f32_const0(float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, 0x41000000, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_const0:
@@ -393,7 +381,6 @@ define float @v_fmaximum3_f32__const2(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 0x41000000
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32__const2:
@@ -421,7 +408,6 @@ define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, 4.0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_inlineimm0:
@@ -449,7 +435,6 @@ define float @v_fmaximum3_f32__inlineimm(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32__inlineimm:
@@ -479,7 +464,6 @@ define float @v_fmaximum3_f32_const1_const2(float %a) {
; GFX12-NEXT: s_mov_b32 s0, 0x41000000
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_maximum3_f32 v0, v0, s0, 0x41800000
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f32_const1_const2:
@@ -508,7 +492,6 @@ define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v4, v0, v2
; GFX12-NEXT: v_maximum3_f32 v1, v5, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32:
@@ -543,7 +526,6 @@ define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, v4
; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32_commute:
@@ -578,7 +560,6 @@ define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v2|, |v4|
; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v3|, |v5|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32__fabs_all:
@@ -616,7 +597,6 @@ define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v2, -v4
; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v3, -v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32__fneg_all:
@@ -654,7 +634,6 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v2
; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm1:
@@ -689,7 +668,6 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, 4.0
; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm2:
@@ -725,7 +703,6 @@ define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float
; GFX12-NEXT: v_maximum3_f32 v0, v6, v0, v3
; GFX12-NEXT: v_maximum3_f32 v1, v7, v1, v4
; GFX12-NEXT: v_maximum3_f32 v2, v8, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32:
@@ -767,7 +744,6 @@ define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3
; GFX12-NEXT: v_maximum3_f32 v0, v0, v3, v6
; GFX12-NEXT: v_maximum3_f32 v1, v1, v4, v7
; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, v8
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32_commute:
@@ -809,7 +785,6 @@ define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b,
; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v3|, |v6|
; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v4|, |v7|
; GFX12-NEXT: v_maximum3_f32 v2, |v2|, |v5|, |v8|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32__fabs_all:
@@ -854,7 +829,6 @@ define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b,
; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v3, -v6
; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v4, -v7
; GFX12-NEXT: v_maximum3_f32 v2, -v2, -v5, -v8
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32__fneg_all:
@@ -899,7 +873,6 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c
; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v3
; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v4
; GFX12-NEXT: v_maximum3_f32 v2, v2, 2.0, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm1:
@@ -941,7 +914,6 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
; GFX12-NEXT: v_maximum3_f32 v0, v0, v3, 4.0
; GFX12-NEXT: v_maximum3_f32 v1, v1, v4, 4.0
; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm2:
@@ -982,7 +954,6 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16:
@@ -1010,7 +981,6 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v2, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_commute:
@@ -1070,7 +1040,6 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, |v0|, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fabs0:
@@ -1099,7 +1068,6 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, |v1|, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fabs1:
@@ -1128,7 +1096,6 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, |v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fabs2:
@@ -1157,7 +1124,6 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, |v0|, |v1|, |v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fabs_all:
@@ -1188,7 +1154,6 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, -v0, -v1, -v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fneg_all:
@@ -1219,7 +1184,6 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all:
@@ -1253,7 +1217,6 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, -v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fneg0:
@@ -1282,7 +1245,6 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, -v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fneg1:
@@ -1311,7 +1273,6 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, -v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_fneg2:
@@ -1340,7 +1301,6 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, 0x4800, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_const0:
@@ -1368,7 +1328,6 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 0x4800
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16__const2:
@@ -1396,7 +1355,6 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, 4.0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_inlineimm0:
@@ -1424,7 +1382,6 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16__inlineimm:
@@ -1454,7 +1411,6 @@ define half @v_fmaximum3_f16_const1_const2(half %a) {
; GFX12-NEXT: s_movk_i32 s0, 0x4800
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f16_const1_const2:
@@ -1484,7 +1440,6 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v2, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16:
@@ -1523,7 +1478,6 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16_commute:
@@ -1565,7 +1519,6 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16__fabs_all:
@@ -1610,7 +1563,6 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16__fneg_all:
@@ -1652,7 +1604,6 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm1:
@@ -1691,7 +1642,6 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm2:
@@ -1732,7 +1682,6 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v4, v0
; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16:
@@ -1783,7 +1732,6 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16_commute:
@@ -1841,7 +1789,6 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16__fabs_all:
@@ -1901,7 +1848,6 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16__fneg_all:
@@ -1955,7 +1901,6 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm1:
@@ -2004,7 +1949,6 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm2:
@@ -2055,7 +1999,6 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v4, v0
; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16:
@@ -2110,7 +2053,6 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16_commute:
@@ -2172,7 +2114,6 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16__fabs_all:
@@ -2236,7 +2177,6 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16__fneg_all:
@@ -2294,7 +2234,6 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16__inlineimm1:
@@ -2349,7 +2288,6 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0 op_sel_hi:[1,0]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_v4f16__inlineimm2:
@@ -2402,7 +2340,6 @@ define double @v_fmaximum3_f64(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64:
@@ -2434,7 +2371,6 @@ define double @v_fmaximum3_f64_commute(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[4:5], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_commute:
@@ -2505,7 +2441,6 @@ define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], |v[0:1]|, v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fabs0:
@@ -2538,7 +2473,6 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fabs1:
@@ -2571,7 +2505,6 @@ define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fabs2:
@@ -2604,7 +2537,6 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], |v[0:1]|, |v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fabs_all:
@@ -2639,7 +2571,6 @@ define double @v_fmaximum3_f64_fneg_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], -v[0:1], -v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fneg_all:
@@ -2674,7 +2605,6 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], -|v[0:1]|, -|v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -|v[4:5]|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fneg_fabs_all:
@@ -2712,7 +2642,6 @@ define double @v_fmaximum3_f64_fneg0(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], -v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fneg0:
@@ -2745,7 +2674,6 @@ define double @v_fmaximum3_f64_fneg1(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fneg1:
@@ -2778,7 +2706,6 @@ define double @v_fmaximum3_f64_fneg2(double %a, double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_fneg2:
@@ -2811,7 +2738,6 @@ define double @v_fmaximum3_f64_const0(double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_const0:
@@ -2845,7 +2771,6 @@ define double @v_fmaximum3_f64__const2(double %a, double %b) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64__const2:
@@ -2879,7 +2804,6 @@ define double @v_fmaximum3_f64_inlineimm0(double %b, double %c) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], 4.0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_inlineimm0:
@@ -2911,7 +2835,6 @@ define double @v_fmaximum3_f64__inlineimm(double %a, double %b) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64__inlineimm:
@@ -2943,7 +2866,6 @@ define double @v_fmaximum3_f64_const1_const2(double %a) {
; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40300000, v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fmaximum3_f64_const1_const2:
@@ -2979,7 +2901,6 @@ define <2 x float> @v_no_fmaximum3_f32__multi_use(float %a, float %b, float %c)
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f32 v1, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fmaximum3_f32__multi_use:
@@ -3044,7 +2965,6 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f16 v1, v0, v2
; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fmaximum3_f16__multi_use:
@@ -3115,7 +3035,6 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b,
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_maximum_f16 v1, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fmaximum3_v2f16__multi_use:
@@ -3155,7 +3074,6 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[2:3], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fmaximum3_f64__multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index de63b99e9139c..d9ba2de48bb01 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -11,7 +11,6 @@ define float @v_fminimum3_f32(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32:
@@ -39,7 +38,6 @@ define float @v_fminimum3_f32_commute(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v2, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_commute:
@@ -95,7 +93,6 @@ define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, |v0|, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fabs0:
@@ -124,7 +121,6 @@ define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, |v1|, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fabs1:
@@ -153,7 +149,6 @@ define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, |v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fabs2:
@@ -182,7 +177,6 @@ define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v1|, |v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fabs_all:
@@ -213,7 +207,6 @@ define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v1, -v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fneg_all:
@@ -244,7 +237,6 @@ define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, -|v0|, -|v1|, -|v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fneg_fabs_all:
@@ -278,7 +270,6 @@ define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, -v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fneg0:
@@ -307,7 +298,6 @@ define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, -v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fneg1:
@@ -336,7 +326,6 @@ define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, -v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_fneg2:
@@ -365,7 +354,6 @@ define float @v_fminimum3_f32_const0(float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, 0x41000000, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_const0:
@@ -393,7 +381,6 @@ define float @v_fminimum3_f32__const2(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 0x41000000
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32__const2:
@@ -421,7 +408,6 @@ define float @v_fminimum3_f32_inlineimm0(float %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, 4.0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_inlineimm0:
@@ -449,7 +435,6 @@ define float @v_fminimum3_f32__inlineimm(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32__inlineimm:
@@ -479,7 +464,6 @@ define float @v_fminimum3_f32_const1_const2(float %a) {
; GFX12-NEXT: s_mov_b32 s0, 0x41000000
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_minimum3_f32 v0, v0, s0, 0x41800000
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f32_const1_const2:
@@ -508,7 +492,6 @@ define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v4, v0, v2
; GFX12-NEXT: v_minimum3_f32 v1, v5, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32:
@@ -543,7 +526,6 @@ define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v2, v4
; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32_commute:
@@ -578,7 +560,6 @@ define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v2|, |v4|
; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v3|, |v5|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32__fabs_all:
@@ -616,7 +597,6 @@ define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v2, -v4
; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v3, -v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32__fneg_all:
@@ -654,7 +634,6 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, 2.0, v2
; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32__inlineimm1:
@@ -689,7 +668,6 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f32 v0, v0, v2, 4.0
; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f32__inlineimm2:
@@ -725,7 +703,6 @@ define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float
; GFX12-NEXT: v_minimum3_f32 v0, v6, v0, v3
; GFX12-NEXT: v_minimum3_f32 v1, v7, v1, v4
; GFX12-NEXT: v_minimum3_f32 v2, v8, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32:
@@ -767,7 +744,6 @@ define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3
; GFX12-NEXT: v_minimum3_f32 v0, v0, v3, v6
; GFX12-NEXT: v_minimum3_f32 v1, v1, v4, v7
; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, v8
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32_commute:
@@ -809,7 +785,6 @@ define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b,
; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v3|, |v6|
; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v4|, |v7|
; GFX12-NEXT: v_minimum3_f32 v2, |v2|, |v5|, |v8|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32__fabs_all:
@@ -854,7 +829,6 @@ define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b,
; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v3, -v6
; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v4, -v7
; GFX12-NEXT: v_minimum3_f32 v2, -v2, -v5, -v8
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32__fneg_all:
@@ -899,7 +873,6 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c
; GFX12-NEXT: v_minimum3_f32 v0, v0, 2.0, v3
; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v4
; GFX12-NEXT: v_minimum3_f32 v2, v2, 2.0, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32__inlineimm1:
@@ -941,7 +914,6 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
; GFX12-NEXT: v_minimum3_f32 v0, v0, v3, 4.0
; GFX12-NEXT: v_minimum3_f32 v1, v1, v4, 4.0
; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f32__inlineimm2:
@@ -982,7 +954,6 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16:
@@ -1010,7 +981,6 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v2, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_commute:
@@ -1070,7 +1040,6 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, |v0|, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fabs0:
@@ -1099,7 +1068,6 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, |v1|, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fabs1:
@@ -1128,7 +1096,6 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, |v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fabs2:
@@ -1157,7 +1124,6 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, |v0|, |v1|, |v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fabs_all:
@@ -1188,7 +1154,6 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, -v0, -v1, -v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fneg_all:
@@ -1219,7 +1184,6 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, -|v0|, -|v1|, -|v2|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fneg_fabs_all:
@@ -1253,7 +1217,6 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, -v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fneg0:
@@ -1282,7 +1245,6 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, -v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fneg1:
@@ -1311,7 +1273,6 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, -v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_fneg2:
@@ -1340,7 +1301,6 @@ define half @v_fminimum3_f16_const0(half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, 0x4800, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_const0:
@@ -1368,7 +1328,6 @@ define half @v_fminimum3_f16__const2(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 0x4800
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16__const2:
@@ -1396,7 +1355,6 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, 4.0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_inlineimm0:
@@ -1424,7 +1382,6 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16__inlineimm:
@@ -1454,7 +1411,6 @@ define half @v_fminimum3_f16_const1_const2(half %a) {
; GFX12-NEXT: s_movk_i32 s0, 0x4800
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_minimum3_f16 v0, v0, s0, 0x4c00
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f16_const1_const2:
@@ -1484,7 +1440,6 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v2, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16:
@@ -1523,7 +1478,6 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16_commute:
@@ -1565,7 +1519,6 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16__fabs_all:
@@ -1610,7 +1563,6 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16__fneg_all:
@@ -1652,7 +1604,6 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16__inlineimm1:
@@ -1691,7 +1642,6 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v2f16__inlineimm2:
@@ -1732,7 +1682,6 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v4, v0
; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16:
@@ -1783,7 +1732,6 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16_commute:
@@ -1841,7 +1789,6 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16__fabs_all:
@@ -1901,7 +1848,6 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16__fneg_all:
@@ -1955,7 +1901,6 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16__inlineimm1:
@@ -2004,7 +1949,6 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v3f16__inlineimm2:
@@ -2055,7 +1999,6 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v4, v0
; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16:
@@ -2110,7 +2053,6 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16_commute:
@@ -2172,7 +2114,6 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16__fabs_all:
@@ -2236,7 +2177,6 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16__fneg_all:
@@ -2294,7 +2234,6 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16__inlineimm1:
@@ -2349,7 +2288,6 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0 op_sel_hi:[1,0]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_v4f16__inlineimm2:
@@ -2402,7 +2340,6 @@ define double @v_fminimum3_f64(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64:
@@ -2434,7 +2371,6 @@ define double @v_fminimum3_f64_commute(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[4:5], v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_commute:
@@ -2505,7 +2441,6 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], |v[0:1]|, v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fabs0:
@@ -2538,7 +2473,6 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fabs1:
@@ -2571,7 +2505,6 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fabs2:
@@ -2604,7 +2537,6 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], |v[0:1]|, |v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fabs_all:
@@ -2639,7 +2571,6 @@ define double @v_fminimum3_f64_fneg_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], -v[0:1], -v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fneg_all:
@@ -2674,7 +2605,6 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], -|v[0:1]|, -|v[2:3]|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -|v[4:5]|
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fneg_fabs_all:
@@ -2712,7 +2642,6 @@ define double @v_fminimum3_f64_fneg0(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], -v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fneg0:
@@ -2745,7 +2674,6 @@ define double @v_fminimum3_f64_fneg1(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fneg1:
@@ -2778,7 +2706,6 @@ define double @v_fminimum3_f64_fneg2(double %a, double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_fneg2:
@@ -2811,7 +2738,6 @@ define double @v_fminimum3_f64_const0(double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_const0:
@@ -2845,7 +2771,6 @@ define double @v_fminimum3_f64__const2(double %a, double %b) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64__const2:
@@ -2879,7 +2804,6 @@ define double @v_fminimum3_f64_inlineimm0(double %b, double %c) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], 4.0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_inlineimm0:
@@ -2911,7 +2835,6 @@ define double @v_fminimum3_f64__inlineimm(double %a, double %b) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], 4.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64__inlineimm:
@@ -2943,7 +2866,6 @@ define double @v_fminimum3_f64_const1_const2(double %a) {
; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40300000, v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fminimum3_f64_const1_const2:
@@ -2979,7 +2901,6 @@ define <2 x float> @v_no_fminimum3_f32__multi_use(float %a, float %b, float %c)
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f32 v1, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fminimum3_f32__multi_use:
@@ -3044,7 +2965,6 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f16 v1, v0, v2
; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fminimum3_f16__multi_use:
@@ -3115,7 +3035,6 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b,
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_minimum_f16 v1, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fminimum3_v2f16__multi_use:
@@ -3155,7 +3074,6 @@ define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[2:3], v[0:1], v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_no_fminimum3_f64__multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index d5159adcd4f02..1914b74be1909 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -78,7 +78,6 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn:
@@ -93,7 +92,6 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret <2 x half> %ret
@@ -112,7 +110,6 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn:
@@ -127,7 +124,6 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -165,7 +161,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_rtn:
@@ -177,7 +172,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
ret <2 x half> %ret
@@ -215,7 +209,6 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn:
@@ -227,7 +220,6 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -267,7 +259,6 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn:
@@ -279,7 +270,6 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -294,7 +284,6 @@ define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16:
@@ -305,7 +294,6 @@ define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
@@ -322,7 +310,6 @@ define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x ha
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16_rtn:
@@ -334,7 +321,6 @@ define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x ha
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index f2d97fbf6ba9c..a9f8c33c0cbfa 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -133,7 +133,6 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
ret float %ret
@@ -225,7 +224,6 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
ret <2 x half> %ret
@@ -271,7 +269,6 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -318,7 +315,6 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -369,7 +365,6 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret <2 x half> %ret
@@ -420,7 +415,6 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index dfc03df40534a..0ba5b068b5da4 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -26,7 +26,6 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32:
@@ -198,7 +197,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
@@ -372,7 +370,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
@@ -556,7 +553,6 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32:
@@ -710,7 +706,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
@@ -867,7 +862,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
@@ -1473,7 +1467,6 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
@@ -1645,7 +1638,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
@@ -1819,7 +1811,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
@@ -2003,7 +1994,6 @@ define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
@@ -2157,7 +2147,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
@@ -2314,7 +2303,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
@@ -11326,7 +11314,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16:
@@ -11558,7 +11545,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspa
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -11792,7 +11778,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspa
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
@@ -12030,7 +12015,6 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16:
@@ -12240,7 +12224,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -12453,7 +12436,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
@@ -12674,7 +12656,6 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrsp
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
@@ -12910,7 +12891,6 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
@@ -13142,7 +13122,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16:
@@ -13472,7 +13451,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -13804,7 +13782,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
@@ -14140,7 +14117,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16:
@@ -14460,7 +14436,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
@@ -14783,7 +14758,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
@@ -15114,7 +15088,6 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
@@ -15448,7 +15421,6 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
@@ -15769,13 +15741,13 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX12-NEXT: s_cbranch_execz .LBB58_2
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_alu 0xfffe
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index da1494974cb57..cf8c0a36220dc 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -26,7 +26,6 @@ define float @global_agent_atomic_fmax_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32:
@@ -189,7 +188,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos:
@@ -354,7 +352,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg:
@@ -519,7 +516,6 @@ define void @global_agent_atomic_fmax_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32:
@@ -676,7 +672,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos:
@@ -836,7 +831,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg:
@@ -1502,7 +1496,6 @@ define float @global_agent_atomic_fmax_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__ftz:
@@ -1665,7 +1658,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz:
@@ -1830,7 +1822,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz:
@@ -1995,7 +1986,6 @@ define void @global_agent_atomic_fmax_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__ftz:
@@ -2152,7 +2142,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz:
@@ -2312,7 +2301,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index 7609f51de5fbb..c4ca6455366a5 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -26,7 +26,6 @@ define float @global_agent_atomic_fmin_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32:
@@ -189,7 +188,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos:
@@ -354,7 +352,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg:
@@ -519,7 +516,6 @@ define void @global_agent_atomic_fmin_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32:
@@ -676,7 +672,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos:
@@ -836,7 +831,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg:
@@ -1502,7 +1496,6 @@ define float @global_agent_atomic_fmin_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__ftz:
@@ -1665,7 +1658,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz:
@@ -1830,7 +1822,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz:
@@ -1995,7 +1986,6 @@ define void @global_agent_atomic_fmin_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__ftz:
@@ -2152,7 +2142,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz:
@@ -2312,7 +2301,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz:
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index c63d6e99a1040..44a2c34b06b57 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -95,7 +95,6 @@ define void @wobble() {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
ret void
@@ -114,7 +113,6 @@ define void @snork() {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 8b56c6040614b..30a7a5b56ca72 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -605,10 +605,10 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cvt_f32_u32 s4, s3
; GFX12-NEXT: s_sub_co_i32 s5, 0, s3
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -812,7 +812,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mov_b32 s0, exec_lo
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
@@ -820,6 +819,7 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s0, s0, 5
@@ -899,7 +899,6 @@ define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) {
; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst
ret void
@@ -1061,7 +1060,6 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12-NEXT: ; implicit-def: $vgpr1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1070,6 +1068,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s1, s1, 5
@@ -1080,6 +1079,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB7_2:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -1247,7 +1247,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: s_mov_b32 s0, exec_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
; GFX12-NEXT: ; implicit-def: $vgpr1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1256,6 +1255,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_i32 s1, s1, 5
@@ -1264,6 +1264,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: .LBB8_2:
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 203af74183ab7..9f093cc7b5abf 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -196,7 +196,6 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v0, v2, v[0:1]
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[1:2]
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i32:
@@ -217,7 +216,6 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
; GFX1200-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add i32 %x, 1
@@ -391,7 +389,6 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i16:
@@ -413,7 +410,6 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%conv33 = add i16 %x, 1
@@ -609,7 +605,6 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i16> %x, <i16 1, i16 1>
@@ -932,7 +927,6 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v3i16:
@@ -960,7 +954,6 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y48 = add <3 x i16> %x, <i16 1, i16 1, i16 1>
@@ -1369,7 +1362,6 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v4i16:
@@ -1397,7 +1389,6 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <4 x i16> %x, <i16 1, i16 1, i16 1, i16 1>
@@ -1565,7 +1556,6 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_umad_pat_i16:
@@ -1587,7 +1577,6 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%conv33 = add i16 %x, 1
@@ -1783,7 +1772,6 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i16> %x, <i16 1, i16 1>
@@ -2106,7 +2094,6 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_umad_pat_v3i16:
@@ -2134,7 +2121,6 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y48 = add <3 x i16> %x, <i16 1, i16 1, i16 1>
@@ -2543,7 +2529,6 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_umad_pat_v4i16:
@@ -2571,7 +2556,6 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX1200-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <4 x i16> %x, <i16 1, i16 1, i16 1, i16 1>
@@ -2857,7 +2841,6 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v0, v[3:4]
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v4, v2, v[4:5]
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i32:
@@ -2888,7 +2871,6 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v2, v0
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v1, v3, v1
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i32> %x, <i32 1, i32 1>
@@ -3277,7 +3259,6 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v2, v[5:6]
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[2:3], null, v6, v3, v[6:7]
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v3i32:
@@ -3319,7 +3300,6 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v1, v4, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v2, v5, v2
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y48 = add <3 x i32> %x, <i32 1, i32 1, i32 1>
@@ -3769,7 +3749,6 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v3, v[7:8]
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[3:4], null, v8, v4, v[8:9]
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v4i32:
@@ -3819,7 +3798,6 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v2, v5, v2
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v3, v6, v3
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
@@ -4030,7 +4008,6 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v0, v2, v[0:1]
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[1:2]
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i24:
@@ -4053,7 +4030,6 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v1, v0, v1
; GFX1200-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%shl = shl i32 %x, 8
@@ -4268,7 +4244,6 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v0, v2, v[0:1]
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[1:2]
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_u24:
@@ -4291,7 +4266,6 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v1, v0, v1
; GFX1200-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%shl = and i32 %x, 16777215
@@ -4467,7 +4441,6 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i8:
@@ -4489,7 +4462,6 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%conv33 = add i8 %x, 1
@@ -4782,7 +4754,6 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX1200-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i8:
@@ -4814,7 +4785,6 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v4
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v5
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i8> %x, <i8 1, i8 1>
@@ -5274,7 +5244,6 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-SDAG-NEXT: v_mul_lo_u32 v4, v1, v3
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v3, v[0:1]
; GFX1200-SDAG-NEXT: v_add3_u32 v1, v4, v1, v2
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i64:
@@ -5319,7 +5288,6 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v6, v[2:3]
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v7, v[1:2]
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add i64 %x, 1
@@ -6146,7 +6114,6 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_add3_u32 v1, v9, v1, v4
; GFX1200-SDAG-NEXT: v_add3_u32 v3, v10, v3, v6
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i64:
@@ -6221,7 +6188,6 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v8, v10
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v2, v14, v15
; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v15, v[4:5]
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i64> %x, <i64 1, i64 1>
@@ -6479,7 +6445,6 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
; GFX1200-NEXT: global_store_b32 v[3:4], v5, off scope:SCOPE_SYS
; GFX1200-NEXT: s_wait_storecnt 0x0
; GFX1200-NEXT: v_add_nc_u32_e32 v0, v5, v0
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
bb:
%i = add i32 %arg, 1
@@ -6713,7 +6678,6 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
; GFX1200-NEXT: global_store_b32 v[3:4], v5, off scope:SCOPE_SYS
; GFX1200-NEXT: s_wait_storecnt 0x0
; GFX1200-NEXT: v_add_nc_u32_e32 v0, v5, v1
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
bb:
%i = add i32 %arg, 1
@@ -7012,7 +6976,6 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v0, v2, v[0:1]
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[1:2]
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i32_x2:
@@ -7044,7 +7007,6 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
; GFX1200-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y38 = add i32 %x, 1
@@ -7564,7 +7526,6 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v0, v[3:4]
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[1:2], null, v4, v2, v[4:5]
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_v2i32_x2:
@@ -7617,7 +7578,6 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v2, v0
; GFX1200-GISEL-NEXT: v_mul_lo_u32 v1, v3, v1
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y38 = add <2 x i32> %x, <i32 1, i32 1>
@@ -7889,7 +7849,6 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
; GFX1200-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_imad_pat_i16_x2:
@@ -7921,7 +7880,6 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%conv69 = add i16 %x, 1
@@ -8187,7 +8145,6 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0
; GFX1200-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: clpeak_umad_pat_i16_x2:
@@ -8219,7 +8176,6 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%conv69 = add i16 %x, 1
@@ -8543,7 +8499,6 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%y38 = add <2 x i16> %x, <i16 1, i16 1>
@@ -8867,7 +8822,6 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX1200-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%y38 = add <2 x i16> %x, <i16 1, i16 1>
@@ -8943,7 +8897,6 @@ define <2 x i32> @multi_use_mul_mad_i32_var(i32 %x, i32 %y, i32 %z0, i32 %z1) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v0, v1, v2
; GFX1200-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul i32 %x, %y
@@ -9059,7 +9012,6 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) {
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v3
; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-SDAG-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: multi_use_mul_mad_i16_var:
@@ -9076,7 +9028,6 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) {
; GFX1200-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul i16 %x, %y
@@ -9148,7 +9099,6 @@ define i32 @other_use_mul_mad_i32_var(i32 %x, i32 %y, i32 %z, ptr addrspace(3) %
; GFX1200-NEXT: v_add_nc_u32_e32 v0, v1, v2
; GFX1200-NEXT: ds_store_b32 v3, v1
; GFX1200-NEXT: s_wait_dscnt 0x0
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul i32 %x, %y
@@ -9249,7 +9199,6 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) %
; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX1200-SDAG-NEXT: ds_store_b16 v3, v4
; GFX1200-SDAG-NEXT: s_wait_dscnt 0x0
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: other_use_mul_mad_i16_var:
@@ -9264,7 +9213,6 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) %
; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v1, v2
; GFX1200-GISEL-NEXT: ds_store_b16 v3, v1
; GFX1200-GISEL-NEXT: s_wait_dscnt 0x0
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul i16 %x, %y
@@ -9382,7 +9330,6 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
; GFX1200-NEXT: v_pk_mad_u16 v1, v0, v1, v3
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1200-NEXT: v_mov_b32_e32 v0, v2
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul <2 x i16> %x, %y
@@ -9510,7 +9457,6 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
; GFX1200-NEXT: v_pk_mad_u16 v0, v0, v1, v2
; GFX1200-NEXT: ds_store_b32 v3, v4
; GFX1200-NEXT: s_wait_dscnt 0x0
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
entry:
%mul = mul <2 x i16> %x, %y
@@ -9594,7 +9540,6 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
-; GFX1200-SDAG-NEXT: s_wait_alu 0xfffe
; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX1200-GISEL-LABEL: mul_u24_add64:
@@ -9609,7 +9554,6 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) {
; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX1200-GISEL-NEXT: s_wait_alu 0xfffe
; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31]
%mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %x, i32 %y)
%add = add i64 %mul, %z
@@ -9669,7 +9613,6 @@ define i64 @mul_u24_zext_add64(i32 %x, i32 %y, i64 %z) {
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1200-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%mul = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y)
%mul.zext = zext i32 %mul to i64
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
index 67c890c279432..99f4fbf359948 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
@@ -12,7 +12,6 @@ define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
@@ -31,7 +30,6 @@ define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inr
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
@@ -48,7 +46,6 @@ define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc,
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
@@ -123,7 +120,6 @@ define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 in
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -142,7 +138,6 @@ define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -159,7 +154,6 @@ define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
main_body:
%unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
index 050cbb544e5ba..de484e3db18ab 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
@@ -12,7 +12,6 @@ define float @raw_buffer_load(<4 x i32> inreg) {
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: buffer_load_b32 v0, off, s[0:3], null th:TH_LOAD_LU
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
main_body:
%data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %0, i32 0, i32 0, i32 3)
@@ -30,7 +29,6 @@ define float @struct_buffer_load(<4 x i32> inreg) {
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_load_b32 v0, v0, s[0:3], null idxen th:TH_LOAD_LU
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
main_body:
%data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 9eb747ebe7149..d3fc96d7ff801 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -28,7 +28,6 @@ define float @test_cvt_f32_bf8_byte0(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e32 v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
ret float %ret
@@ -49,7 +48,6 @@ define float @test_cvt_f32_bf8_byte1(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
ret float %ret
@@ -70,7 +68,6 @@ define float @test_cvt_f32_bf8_byte2(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
ret float %ret
@@ -91,7 +88,6 @@ define float @test_cvt_f32_bf8_byte3(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3)
ret float %ret
@@ -112,7 +108,6 @@ define float @test_cvt_f32_fp8_byte0(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e32 v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0)
ret float %ret
@@ -133,7 +128,6 @@ define float @test_cvt_f32_fp8_byte1(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
ret float %ret
@@ -154,7 +148,6 @@ define float @test_cvt_f32_fp8_byte2(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
ret float %ret
@@ -175,7 +168,6 @@ define float @test_cvt_f32_fp8_byte3(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3)
ret float %ret
@@ -196,7 +188,6 @@ define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
ret <2 x float> %ret
@@ -217,7 +208,6 @@ define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true)
ret <2 x float> %ret
@@ -238,7 +228,6 @@ define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false)
ret <2 x float> %ret
@@ -259,7 +248,6 @@ define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
ret <2 x float> %ret
@@ -283,7 +271,6 @@ define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) {
; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false)
ret i32 %ret
@@ -308,7 +295,6 @@ define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) {
; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true)
ret i32 %ret
@@ -332,7 +318,6 @@ define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false)
ret i32 %ret
@@ -357,7 +342,6 @@ define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true)
ret i32 %ret
@@ -381,7 +365,6 @@ define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0)
ret i32 %ret
@@ -405,7 +388,6 @@ define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1)
ret i32 %ret
@@ -430,7 +412,6 @@ define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2)
ret i32 %ret
@@ -455,7 +436,6 @@ define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 byte_sel:3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3)
ret i32 %ret
@@ -479,7 +459,6 @@ define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0)
ret i32 %ret
@@ -503,7 +482,6 @@ define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1)
ret i32 %ret
@@ -528,7 +506,6 @@ define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2)
ret i32 %ret
@@ -553,7 +530,6 @@ define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 byte_sel:3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3)
ret i32 %ret
@@ -577,7 +553,6 @@ define float @test_sext_cvt_f32_fp8(i16 %a) {
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a.sext, i32 1)
@@ -602,7 +577,6 @@ define float @test_sext_cvt_f32_bf8(i16 %a) {
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a.sext, i32 1)
@@ -627,7 +601,6 @@ define <2 x float> @test_sext_cvt_pk_f32_bf8_word1(i16 %a) {
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a.sext, i1 true)
@@ -652,7 +625,6 @@ define <2 x float> @test_sext_cvt_pk_f32_fp8_word0(i16 %a) {
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a.sext, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
index f78b0a9c4ad2c..8ea10f4496a2e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dot4.f32.ll
@@ -11,7 +11,6 @@ define float @test_amdgcn_dot4_f32_fp8_bf8(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%ret = call float @llvm.amdgcn.dot4.f32.fp8.bf8(i32 %a, i32 %b, float %c)
@@ -27,7 +26,6 @@ define float @test_amdgcn_dot4_f32_fp8_bf8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -44,7 +42,6 @@ define float @test_amdgcn_dot4_f32_fp8_bf8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -61,7 +58,6 @@ define float @test_amdgcn_dot4_f32_fp8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -79,7 +75,6 @@ define float @test_amdgcn_dot4_f32_fp8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -97,7 +92,6 @@ define float @test_amdgcn_dot4_f32_bf8_fp8(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%ret = call float @llvm.amdgcn.dot4.f32.bf8.fp8(i32 %a, i32 %b, float %c)
@@ -113,7 +107,6 @@ define float @test_amdgcn_dot4_f32_bf8_fp8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -130,7 +123,6 @@ define float @test_amdgcn_dot4_f32_bf8_fp8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -147,7 +139,6 @@ define float @test_amdgcn_dot4_f32_bf8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -165,7 +156,6 @@ define float @test_amdgcn_dot4_f32_bf8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -183,7 +173,6 @@ define float @test_amdgcn_dot4_f32_fp8_fp8(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%ret = call float @llvm.amdgcn.dot4.f32.fp8.fp8(i32 %a, i32 %b, float %c)
@@ -199,7 +188,6 @@ define float @test_amdgcn_dot4_f32_fp8_fp8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -216,7 +204,6 @@ define float @test_amdgcn_dot4_f32_fp8_fp8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -233,7 +220,6 @@ define float @test_amdgcn_dot4_f32_fp8_fp8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -251,7 +237,6 @@ define float @test_amdgcn_dot4_f32_fp8_fp8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_fp8_fp8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -269,7 +254,6 @@ define float @test_amdgcn_dot4_f32_bf8_bf8(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%ret = call float @llvm.amdgcn.dot4.f32.bf8.bf8(i32 %a, i32 %b, float %c)
@@ -285,7 +269,6 @@ define float @test_amdgcn_dot4_f32_bf8_bf8_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
@@ -302,7 +285,6 @@ define float @test_amdgcn_dot4_f32_bf8_bf8_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -319,7 +301,6 @@ define float @test_amdgcn_dot4_f32_bf8_bf8_fabs_fneg(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fneg.c = fneg float %c
@@ -337,7 +318,6 @@ define float @test_amdgcn_dot4_f32_bf8_bf8_fneg_fabs(i32 %a, i32 %b, float %c) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dot4_f32_bf8_bf8 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%fabs.c = call float @llvm.fabs.f32(float %c)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index ce382942315bb..119885cba78c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -9217,7 +9217,6 @@ define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call half @llvm.amdgcn.permlane16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store half %v, ptr addrspace(1) %out
@@ -9256,7 +9255,6 @@ define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call half @llvm.amdgcn.permlanex16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store half %v, ptr addrspace(1) %out
@@ -9295,7 +9293,6 @@ define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call bfloat @llvm.amdgcn.permlane16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store bfloat %v, ptr addrspace(1) %out
@@ -9334,7 +9331,6 @@ define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call bfloat @llvm.amdgcn.permlanex16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store bfloat %v, ptr addrspace(1) %out
@@ -9373,7 +9369,6 @@ define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call i16 @llvm.amdgcn.permlane16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store i16 %v, ptr addrspace(1) %out
@@ -9412,7 +9407,6 @@ define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b16 v[0:1], v2, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call i16 @llvm.amdgcn.permlanex16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store i16 %v, ptr addrspace(1) %out
@@ -9451,7 +9445,6 @@ define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %sr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x half> @llvm.amdgcn.permlane16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <2 x half> %v, ptr addrspace(1) %out
@@ -9490,7 +9483,6 @@ define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %s
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x half> @llvm.amdgcn.permlanex16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <2 x half> %v, ptr addrspace(1) %out
@@ -9553,7 +9545,6 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlane16_v2f32:
@@ -9569,7 +9560,6 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x float> @llvm.amdgcn.permlane16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <2 x float> %v, ptr addrspace(1) %out
@@ -9632,7 +9622,6 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlanex16_v2f32:
@@ -9648,7 +9637,6 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %
; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <2 x float> @llvm.amdgcn.permlanex16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <2 x float> %v, ptr addrspace(1) %out
@@ -9744,7 +9732,6 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlane16_v7i32:
@@ -9767,7 +9754,6 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <7 x i32> @llvm.amdgcn.permlane16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <7 x i32> %v, ptr addrspace(1) %out
@@ -9863,7 +9849,6 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlanex16_v7i32:
@@ -9886,7 +9871,6 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <7 x i32> @llvm.amdgcn.permlanex16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <7 x i32> %v, ptr addrspace(1) %out
@@ -9959,7 +9943,6 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlane16_v8i16:
@@ -9977,7 +9960,6 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src
; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1
; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1
; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <8 x i16> @llvm.amdgcn.permlane16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <8 x i16> %v, ptr addrspace(1) %out
@@ -10050,7 +10032,6 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: v_permlanex16_v8i16:
@@ -10068,7 +10049,6 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1
; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1
; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%v = call <8 x i16> @llvm.amdgcn.permlanex16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <8 x i16> %v, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
index dca743939e706..bb42834221681 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll
@@ -38,7 +38,6 @@ define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %s
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr @llvm.amdgcn.permlane16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr %v, ptr addrspace(1) %out
@@ -80,7 +79,6 @@ define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr @llvm.amdgcn.permlanex16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr %v, ptr addrspace(1) %out
@@ -139,7 +137,6 @@ define void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr> @llvm.amdgcn.permlane16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr> %v, ptr addrspace(1) %out
@@ -198,7 +195,6 @@ define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16
; GFX12-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr> @llvm.amdgcn.permlanex16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr> %v, ptr addrspace(1) %out
@@ -237,7 +233,6 @@ define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(3) @llvm.amdgcn.permlane16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(3) %v, ptr addrspace(1) %out
@@ -276,7 +271,6 @@ define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(3) @llvm.amdgcn.permlanex16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(3) %v, ptr addrspace(1) %out
@@ -321,7 +315,6 @@ define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %sr
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
@@ -366,7 +359,6 @@ define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %s
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlanex16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
@@ -405,7 +397,6 @@ define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(5) @llvm.amdgcn.permlane16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(5) %v, ptr addrspace(1) %out
@@ -444,7 +435,6 @@ define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(5) @llvm.amdgcn.permlanex16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(5) %v, ptr addrspace(1) %out
@@ -489,7 +479,6 @@ define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %sr
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
@@ -534,7 +523,6 @@ define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %s
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlanex16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
@@ -573,7 +561,6 @@ define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(6) @llvm.amdgcn.permlane16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(6) %v, ptr addrspace(1) %out
@@ -612,7 +599,6 @@ define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call ptr addrspace(6) @llvm.amdgcn.permlanex16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store ptr addrspace(6) %v, ptr addrspace(1) %out
@@ -657,7 +643,6 @@ define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %sr
; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
@@ -702,7 +687,6 @@ define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %s
; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT: global_store_b96 v[0:1], v[2:4], off
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlanex16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
index a827548f6abeb..8a0602e0472b5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -12,7 +12,6 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 128
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -29,7 +28,6 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2)
ret <2 x bfloat> %ret
@@ -44,7 +42,6 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s6 offen offset:128
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 128
%unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -60,7 +57,6 @@ define void @raw_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__0_voff
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_NT
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%unused = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 2)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
index b137d3462e156..ce46e2755ae58 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_nortn.ll
@@ -42,7 +42,6 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24)
ret void
@@ -86,7 +85,6 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret void
@@ -130,7 +128,6 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -174,7 +171,6 @@ define void @raw_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffs
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0)
ret void
@@ -218,7 +214,6 @@ define void @raw_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voff
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
index 46c816fb4c51a..327d80a7b67cd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd_rtn.ll
@@ -31,7 +31,6 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 24)
ret float %ret
@@ -65,7 +64,6 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, off, s[0:3], s6 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0)
ret float %ret
@@ -99,7 +97,6 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgp
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
@@ -133,7 +130,6 @@ define <2 x half> @raw_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__0_v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s6 offset:92 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 92, i32 %soffset, i32 0)
ret <2 x half> %ret
@@ -167,7 +163,6 @@ define float @raw_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffs
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 8fdf604d95238..6e029f7c0a95e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -223,7 +223,6 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal m0
-; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg)
ret void
@@ -499,7 +498,6 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
; GCN-NEXT: v_mul_lo_u32 v0, v1, v0
; GCN-NEXT: global_store_b32 v[7:8], v0, off
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst_var:
@@ -531,7 +529,6 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v0, v1, v0
; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v0, off
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
@@ -965,7 +962,6 @@ define void @test5_s_barrier_join_m0(i32 %arg) {
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
; GLOBAL-ISEL-NEXT: s_barrier_join m0
-; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.barrier.join(i32 %arg)
ret void
@@ -1224,7 +1220,6 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) {
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0
-; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.wakeup.barrier(i32 %arg)
ret void
@@ -1404,7 +1399,6 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GLOBAL-ISEL-LABEL: test5_s_get_barrier_state_m0:
@@ -1420,7 +1414,6 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v0, s0
-; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe
; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
%state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg)
ret i32 %state
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
index 70dff2c800a4b..bc7052132a87b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.var.ll
@@ -15,7 +15,6 @@ define void @test_s_sleep_var1(i32 %arg) {
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: s_sleep_var s0
-; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.sleep.var(i32 %arg)
ret void
@@ -30,7 +29,6 @@ define void @test_s_sleep_var2() {
; GCN-NEXT: s_wait_bvhcnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_sleep_var 10
-; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_setpc_b64 s[30:31]
call void @llvm.amdgcn.s.sleep.var(i32 10)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
index d46f21d28556f..2efade9fcbba1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
@@ -12,7 +12,6 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsr
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x bfloat> %ret
@@ -27,7 +26,6 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgp
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%unused = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
index 2d03e3e122a70..d5b5c71cc42a9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll
@@ -46,7 +46,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -91,7 +90,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
@@ -139,7 +137,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
@@ -187,7 +184,6 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
index 7f9712a283ecb..a312a3cb0a95c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll
@@ -35,7 +35,6 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
@@ -70,7 +69,6 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffs
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
@@ -108,7 +106,6 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
@@ -146,7 +143,6 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
-; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x half> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
index 07d43ba5f2e7a..8d1dce76d2cc8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll
@@ -56,7 +56,6 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
@@ -112,7 +111,6 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -169,7 +167,6 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__0_voffs
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
@@ -225,7 +222,6 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
@@ -278,7 +274,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -331,7 +326,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -386,7 +380,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voff
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s6 idxen
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
@@ -439,7 +432,6 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
index 0bcdc29d642c2..06b1a9cc70513 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll
@@ -56,7 +56,6 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret float %ret
@@ -112,7 +111,6 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -169,7 +167,6 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__0_voff
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret float %ret
@@ -225,7 +222,6 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret float %ret
@@ -278,7 +274,6 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
@@ -331,7 +326,6 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen offset:256
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%voffset.add = add i32 %voffset, 256
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0)
@@ -386,7 +380,6 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__0_vof
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s6 idxen
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
ret void
@@ -439,7 +432,6 @@ define void @struct_ptr_buffer_atomic_fmin_f32_noret__vgpr_val__sgpr_rsrc__vgpr_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
index 7cca57e36c7f9..5ea89bc574910 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
@@ -52,7 +52,6 @@ define amdgpu_gfx void @test_wave_id_callable(ptr addrspace(1) %out) {
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%waveid = call i32 @llvm.amdgcn.wave.id()
store i32 %waveid, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index e7e0eb8ed370a..bc0daf95e329c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -74,7 +74,6 @@ define half @v_maximum_f16(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call half @llvm.maximum.f16(half %src0, half %src1)
ret half %op
@@ -129,7 +128,6 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan half @llvm.maximum.f16(half %src0, half %src1)
ret half %op
@@ -202,7 +200,6 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz half @llvm.maximum.f16(half %src0, half %src1)
ret half %op
@@ -257,7 +254,6 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz half @llvm.maximum.f16(half %src0, half %src1)
ret half %op
@@ -338,7 +334,6 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) {
; GFX12-NEXT: v_add_f16_e32 v0, 1.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan half %arg0, 1.0
%op = call half @llvm.maximum.f16(half %src0, half %src1)
@@ -420,7 +415,6 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) {
; GFX12-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan half %arg1, 1.0
%op = call half @llvm.maximum.f16(half %src0, half %src1)
@@ -639,7 +633,6 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -701,7 +694,6 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -810,7 +802,6 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -872,7 +863,6 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -1021,7 +1011,6 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.maximum.v2f16(<2 x half> %src0, <2 x half> %src1)
%cast = bitcast <2 x half> %op to i32
@@ -1156,7 +1145,6 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1229,7 +1217,6 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1362,7 +1349,6 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1435,7 +1421,6 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x half> @llvm.maximum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1601,7 +1586,6 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1681,7 +1665,6 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1847,7 +1830,6 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1927,7 +1909,6 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x half> @llvm.maximum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -2207,7 +2188,6 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) {
; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
; GFX12-NEXT: v_pk_maximum_f16 v2, v2, v6
; GFX12-NEXT: v_pk_maximum_f16 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x half> @llvm.maximum.v8f16(<8 x half> %src0, <8 x half> %src1)
ret <8 x half> %op
@@ -2723,7 +2703,6 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
; GFX12-NEXT: v_pk_maximum_f16 v5, v5, v13
; GFX12-NEXT: v_pk_maximum_f16 v6, v6, v14
; GFX12-NEXT: v_pk_maximum_f16 v7, v7, v15
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x half> @llvm.maximum.v16f16(<16 x half> %src0, <16 x half> %src1)
ret <16 x half> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index 1a9be3bddb160..6b61931fc9414 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -70,7 +70,6 @@ define float @v_maximum_f32(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call float @llvm.maximum.f32(float %src0, float %src1)
ret float %op
@@ -121,7 +120,6 @@ define float @v_maximum_f32__nnan(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan float @llvm.maximum.f32(float %src0, float %src1)
ret float %op
@@ -190,7 +188,6 @@ define float @v_maximum_f32__nsz(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz float @llvm.maximum.f32(float %src0, float %src1)
ret float %op
@@ -241,7 +238,6 @@ define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz float @llvm.maximum.f32(float %src0, float %src1)
ret float %op
@@ -318,7 +314,6 @@ define float @v_maximum_f32__nnan_src0(float %arg0, float %src1) {
; GFX12-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan float %arg0, 1.0
%op = call float @llvm.maximum.f32(float %src0, float %src1)
@@ -396,7 +391,6 @@ define float @v_maximum_f32__nnan_src1(float %src0, float %arg1) {
; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan float %arg1, 1.0
%op = call float @llvm.maximum.f32(float %src0, float %src1)
@@ -580,7 +574,6 @@ define <2 x float> @v_maximum_v2f32(<2 x float> %src0, <2 x float> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v2
; GFX12-NEXT: v_maximum_f32 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -637,7 +630,6 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v2
; GFX12-NEXT: v_maximum_f32 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -725,7 +717,6 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v2
; GFX12-NEXT: v_maximum_f32 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -782,7 +773,6 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f32 v0, v0, v2
; GFX12-NEXT: v_maximum_f32 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x float> @llvm.maximum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -1008,7 +998,6 @@ define <3 x float> @v_maximum_v3f32(<3 x float> %src0, <3 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v0, v0, v3
; GFX12-NEXT: v_maximum_f32 v1, v1, v4
; GFX12-NEXT: v_maximum_f32 v2, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1072,7 +1061,6 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
; GFX12-NEXT: v_maximum_f32 v0, v0, v3
; GFX12-NEXT: v_maximum_f32 v1, v1, v4
; GFX12-NEXT: v_maximum_f32 v2, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1180,7 +1168,6 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v0, v0, v3
; GFX12-NEXT: v_maximum_f32 v1, v1, v4
; GFX12-NEXT: v_maximum_f32 v2, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1244,7 +1231,6 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
; GFX12-NEXT: v_maximum_f32 v0, v0, v3
; GFX12-NEXT: v_maximum_f32 v1, v1, v4
; GFX12-NEXT: v_maximum_f32 v2, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x float> @llvm.maximum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1372,7 +1358,6 @@ define <4 x float> @v_maximum_v4f32(<4 x float> %src0, <4 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v1, v1, v5
; GFX12-NEXT: v_maximum_f32 v2, v2, v6
; GFX12-NEXT: v_maximum_f32 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1442,7 +1427,6 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
; GFX12-NEXT: v_maximum_f32 v1, v1, v5
; GFX12-NEXT: v_maximum_f32 v2, v2, v6
; GFX12-NEXT: v_maximum_f32 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1570,7 +1554,6 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v1, v1, v5
; GFX12-NEXT: v_maximum_f32 v2, v2, v6
; GFX12-NEXT: v_maximum_f32 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1640,7 +1623,6 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
; GFX12-NEXT: v_maximum_f32 v1, v1, v5
; GFX12-NEXT: v_maximum_f32 v2, v2, v6
; GFX12-NEXT: v_maximum_f32 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x float> @llvm.maximum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1846,7 +1828,6 @@ define <8 x float> @v_maximum_v8f32(<8 x float> %src0, <8 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v5, v5, v13
; GFX12-NEXT: v_maximum_f32 v6, v6, v14
; GFX12-NEXT: v_maximum_f32 v7, v7, v15
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x float> @llvm.maximum.v8f32(<8 x float> %src0, <8 x float> %src1)
ret <8 x float> %op
@@ -2248,7 +2229,6 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX12-NEXT: v_maximum_f32 v14, v14, v30
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_maximum_f32 v15, v15, v31
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x float> @llvm.maximum.v16f32(<16 x float> %src0, <16 x float> %src1)
ret <16 x float> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
index d0122891f96b1..9a83c04cad1e3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
@@ -76,7 +76,6 @@ define double @v_maximum_f64(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.maximum.f64(double %src0, double %src1)
ret double %op
@@ -127,7 +126,6 @@ define double @v_maximum_f64__nnan(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan double @llvm.maximum.f64(double %src0, double %src1)
ret double %op
@@ -202,7 +200,6 @@ define double @v_maximum_f64__nsz(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz double @llvm.maximum.f64(double %src0, double %src1)
ret double %op
@@ -253,7 +250,6 @@ define double @v_maximum_f64__nnan_nsz(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz double @llvm.maximum.f64(double %src0, double %src1)
ret double %op
@@ -337,7 +333,6 @@ define double @v_maximum_f64__nnan_src0(double %arg0, double %src1) {
; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan double %arg0, 1.0
%op = call double @llvm.maximum.f64(double %src0, double %src1)
@@ -422,7 +417,6 @@ define double @v_maximum_f64__nnan_src1(double %src0, double %arg1) {
; GFX12-NEXT: v_add_f64_e32 v[2:3], 1.0, v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan double %arg1, 1.0
%op = call double @llvm.maximum.f64(double %src0, double %src1)
@@ -526,7 +520,6 @@ define void @s_maximum_f64(double inreg %src0, double inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v[0:1]
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.maximum.f64(double %src0, double %src1)
call void asm sideeffect "; use $0", "s"(double %op)
@@ -628,7 +621,6 @@ define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -686,7 +678,6 @@ define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -787,7 +778,6 @@ define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -845,7 +835,6 @@ define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -980,7 +969,6 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v[0:3]
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.maximum.v2f64(<2 x double> %src0, <2 x double> %src1)
call void asm sideeffect "; use $0", "s"(<2 x double> %op)
@@ -1107,7 +1095,6 @@ define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) {
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[10:11]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1172,7 +1159,6 @@ define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[10:11]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1298,7 +1284,6 @@ define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[10:11]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1363,7 +1348,6 @@ define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[10:11]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x double> @llvm.maximum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1515,7 +1499,6 @@ define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) {
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1587,7 +1570,6 @@ define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1739,7 +1721,6 @@ define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1811,7 +1792,6 @@ define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
; GFX12-NEXT: v_maximum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_maximum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_maximum_f64 v[6:7], v[6:7], v[14:15]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x double> @llvm.maximum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -2078,7 +2058,6 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) {
; GFX12-NEXT: v_maximum_f64 v[12:13], v[12:13], v[28:29]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[14:15], v[14:15], v[30:31]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x double> @llvm.maximum.v8f64(<8 x double> %src0, <8 x double> %src1)
ret <8 x double> %op
@@ -2955,7 +2934,6 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX12-NEXT: v_maximum_f64 v[28:29], v[28:29], v[84:85]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_maximum_f64 v[30:31], v[30:31], v[86:87]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x double> @llvm.maximum.v16f64(<16 x double> %src0, <16 x double> %src1)
ret <16 x double> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index c237c0d1de2c9..77b5682a2dbd1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -61,7 +61,6 @@ define half @v_minimum_f16(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call half @llvm.minimum.f16(half %src0, half %src1)
ret half %op
@@ -106,7 +105,6 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan half @llvm.minimum.f16(half %src0, half %src1)
ret half %op
@@ -166,7 +164,6 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz half @llvm.minimum.f16(half %src0, half %src1)
ret half %op
@@ -211,7 +208,6 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz half @llvm.minimum.f16(half %src0, half %src1)
ret half %op
@@ -278,7 +274,6 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) {
; GFX12-NEXT: v_add_f16_e32 v0, 1.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan half %arg0, 1.0
%op = call half @llvm.minimum.f16(half %src0, half %src1)
@@ -346,7 +341,6 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) {
; GFX12-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan half %arg1, 1.0
%op = call half @llvm.minimum.f16(half %src0, half %src1)
@@ -528,7 +522,6 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -575,7 +568,6 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -664,7 +656,6 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -711,7 +702,6 @@ define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
ret <2 x half> %op
@@ -833,7 +823,6 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.minimum.v2f16(<2 x half> %src0, <2 x half> %src1)
%cast = bitcast <2 x half> %op to i32
@@ -941,7 +930,6 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -994,7 +982,6 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1100,7 +1087,6 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1153,7 +1139,6 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x half> @llvm.minimum.v3f16(<3 x half> %src0, <3 x half> %src1)
ret <3 x half> %op
@@ -1285,7 +1270,6 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1340,7 +1324,6 @@ define <4 x half> @v_minimum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1472,7 +1455,6 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1527,7 +1509,6 @@ define <4 x half> @v_minimum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x half> @llvm.minimum.v4f16(<4 x half> %src0, <4 x half> %src1)
ret <4 x half> %op
@@ -1745,7 +1726,6 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) {
; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
; GFX12-NEXT: v_pk_minimum_f16 v2, v2, v6
; GFX12-NEXT: v_pk_minimum_f16 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x half> @llvm.minimum.v8f16(<8 x half> %src0, <8 x half> %src1)
ret <8 x half> %op
@@ -2141,7 +2121,6 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
; GFX12-NEXT: v_pk_minimum_f16 v5, v5, v13
; GFX12-NEXT: v_pk_minimum_f16 v6, v6, v14
; GFX12-NEXT: v_pk_minimum_f16 v7, v7, v15
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x half> @llvm.minimum.v16f16(<16 x half> %src0, <16 x half> %src1)
ret <16 x half> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index d981cc44903de..8753dc50c4da4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -70,7 +70,6 @@ define float @v_minimum_f32(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call float @llvm.minimum.f32(float %src0, float %src1)
ret float %op
@@ -121,7 +120,6 @@ define float @v_minimum_f32__nnan(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan float @llvm.minimum.f32(float %src0, float %src1)
ret float %op
@@ -190,7 +188,6 @@ define float @v_minimum_f32__nsz(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz float @llvm.minimum.f32(float %src0, float %src1)
ret float %op
@@ -241,7 +238,6 @@ define float @v_minimum_f32__nnan_nsz(float %src0, float %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz float @llvm.minimum.f32(float %src0, float %src1)
ret float %op
@@ -318,7 +314,6 @@ define float @v_minimum_f32__nnan_src0(float %arg0, float %src1) {
; GFX12-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan float %arg0, 1.0
%op = call float @llvm.minimum.f32(float %src0, float %src1)
@@ -396,7 +391,6 @@ define float @v_minimum_f32__nnan_src1(float %src0, float %arg1) {
; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan float %arg1, 1.0
%op = call float @llvm.minimum.f32(float %src0, float %src1)
@@ -580,7 +574,6 @@ define <2 x float> @v_minimum_v2f32(<2 x float> %src0, <2 x float> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v2
; GFX12-NEXT: v_minimum_f32 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -637,7 +630,6 @@ define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v2
; GFX12-NEXT: v_minimum_f32 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -725,7 +717,6 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v2
; GFX12-NEXT: v_minimum_f32 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -782,7 +773,6 @@ define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f32 v0, v0, v2
; GFX12-NEXT: v_minimum_f32 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x float> @llvm.minimum.v2f32(<2 x float> %src0, <2 x float> %src1)
ret <2 x float> %op
@@ -1008,7 +998,6 @@ define <3 x float> @v_minimum_v3f32(<3 x float> %src0, <3 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v0, v0, v3
; GFX12-NEXT: v_minimum_f32 v1, v1, v4
; GFX12-NEXT: v_minimum_f32 v2, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1072,7 +1061,6 @@ define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
; GFX12-NEXT: v_minimum_f32 v0, v0, v3
; GFX12-NEXT: v_minimum_f32 v1, v1, v4
; GFX12-NEXT: v_minimum_f32 v2, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1180,7 +1168,6 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v0, v0, v3
; GFX12-NEXT: v_minimum_f32 v1, v1, v4
; GFX12-NEXT: v_minimum_f32 v2, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1244,7 +1231,6 @@ define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
; GFX12-NEXT: v_minimum_f32 v0, v0, v3
; GFX12-NEXT: v_minimum_f32 v1, v1, v4
; GFX12-NEXT: v_minimum_f32 v2, v2, v5
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x float> @llvm.minimum.v3f32(<3 x float> %src0, <3 x float> %src1)
ret <3 x float> %op
@@ -1372,7 +1358,6 @@ define <4 x float> @v_minimum_v4f32(<4 x float> %src0, <4 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v1, v1, v5
; GFX12-NEXT: v_minimum_f32 v2, v2, v6
; GFX12-NEXT: v_minimum_f32 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1442,7 +1427,6 @@ define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
; GFX12-NEXT: v_minimum_f32 v1, v1, v5
; GFX12-NEXT: v_minimum_f32 v2, v2, v6
; GFX12-NEXT: v_minimum_f32 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1570,7 +1554,6 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v1, v1, v5
; GFX12-NEXT: v_minimum_f32 v2, v2, v6
; GFX12-NEXT: v_minimum_f32 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1640,7 +1623,6 @@ define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
; GFX12-NEXT: v_minimum_f32 v1, v1, v5
; GFX12-NEXT: v_minimum_f32 v2, v2, v6
; GFX12-NEXT: v_minimum_f32 v3, v3, v7
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x float> @llvm.minimum.v4f32(<4 x float> %src0, <4 x float> %src1)
ret <4 x float> %op
@@ -1846,7 +1828,6 @@ define <8 x float> @v_minimum_v8f32(<8 x float> %src0, <8 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v5, v5, v13
; GFX12-NEXT: v_minimum_f32 v6, v6, v14
; GFX12-NEXT: v_minimum_f32 v7, v7, v15
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x float> @llvm.minimum.v8f32(<8 x float> %src0, <8 x float> %src1)
ret <8 x float> %op
@@ -2248,7 +2229,6 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
; GFX12-NEXT: v_minimum_f32 v14, v14, v30
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_minimum_f32 v15, v15, v31
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x float> @llvm.minimum.v16f32(<16 x float> %src0, <16 x float> %src1)
ret <16 x float> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
index 7cf68fdddf356..81b892d424b46 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
@@ -76,7 +76,6 @@ define double @v_minimum_f64(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.minimum.f64(double %src0, double %src1)
ret double %op
@@ -127,7 +126,6 @@ define double @v_minimum_f64__nnan(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan double @llvm.minimum.f64(double %src0, double %src1)
ret double %op
@@ -202,7 +200,6 @@ define double @v_minimum_f64__nsz(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz double @llvm.minimum.f64(double %src0, double %src1)
ret double %op
@@ -253,7 +250,6 @@ define double @v_minimum_f64__nnan_nsz(double %src0, double %src1) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz double @llvm.minimum.f64(double %src0, double %src1)
ret double %op
@@ -337,7 +333,6 @@ define double @v_minimum_f64__nnan_src0(double %arg0, double %src1) {
; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src0 = fadd nnan double %arg0, 1.0
%op = call double @llvm.minimum.f64(double %src0, double %src1)
@@ -422,7 +417,6 @@ define double @v_minimum_f64__nnan_src1(double %src0, double %arg1) {
; GFX12-NEXT: v_add_f64_e32 v[2:3], 1.0, v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%src1 = fadd nnan double %arg1, 1.0
%op = call double @llvm.minimum.f64(double %src0, double %src1)
@@ -526,7 +520,6 @@ define void @s_minimum_f64(double inreg %src0, double inreg %src1) {
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v[0:1]
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call double @llvm.minimum.f64(double %src0, double %src1)
call void asm sideeffect "; use $0", "s"(double %op)
@@ -628,7 +621,6 @@ define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -686,7 +678,6 @@ define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -787,7 +778,6 @@ define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -845,7 +835,6 @@ define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double>
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
ret <2 x double> %op
@@ -980,7 +969,6 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1)
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use v[0:3]
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x double> @llvm.minimum.v2f64(<2 x double> %src0, <2 x double> %src1)
call void asm sideeffect "; use $0", "s"(<2 x double> %op)
@@ -1107,7 +1095,6 @@ define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) {
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[10:11]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1172,7 +1159,6 @@ define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[10:11]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1298,7 +1284,6 @@ define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[10:11]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1363,7 +1348,6 @@ define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double>
; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[6:7]
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[8:9]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[10:11]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <3 x double> @llvm.minimum.v3f64(<3 x double> %src0, <3 x double> %src1)
ret <3 x double> %op
@@ -1515,7 +1499,6 @@ define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) {
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1587,7 +1570,6 @@ define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1739,7 +1721,6 @@ define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nsz <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -1811,7 +1792,6 @@ define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double>
; GFX12-NEXT: v_minimum_f64 v[2:3], v[2:3], v[10:11]
; GFX12-NEXT: v_minimum_f64 v[4:5], v[4:5], v[12:13]
; GFX12-NEXT: v_minimum_f64 v[6:7], v[6:7], v[14:15]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call nnan nsz <4 x double> @llvm.minimum.v4f64(<4 x double> %src0, <4 x double> %src1)
ret <4 x double> %op
@@ -2078,7 +2058,6 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) {
; GFX12-NEXT: v_minimum_f64 v[12:13], v[12:13], v[28:29]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[14:15], v[14:15], v[30:31]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x double> @llvm.minimum.v8f64(<8 x double> %src0, <8 x double> %src1)
ret <8 x double> %op
@@ -2955,7 +2934,6 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1)
; GFX12-NEXT: v_minimum_f64 v[28:29], v[28:29], v[84:85]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_minimum_f64 v[30:31], v[30:31], v[86:87]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%op = call <16 x double> @llvm.minimum.v16f64(<16 x double> %src0, <16 x double> %src1)
ret <16 x double> %op
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 28e1808b76e73..53ea253035655 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -127,7 +127,6 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
@@ -324,7 +323,6 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
@@ -782,7 +780,6 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
@@ -855,7 +852,6 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 4c59b77e52205..0221f9992ad43 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -2385,12 +2385,11 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013
; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v30, s6 :: v_dual_and_b32 v13, 1, v13
; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10011
; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10010
; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10017
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v27, s9 :: v_dual_and_b32 v24, 1, v6
; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10016
; GFX12-NEXT: v_and_b32_e32 v9, 1, v9
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index d27f9806f2e04..d5947aa790cef 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -2954,7 +2954,6 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s31
; GFX12-NEXT: s_lshr_b32 s29, s10, 16
; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s30
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s29
; GFX12-NEXT: s_lshr_b32 s28, s11, 16
; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
@@ -3446,7 +3445,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s31
; GFX12-NEXT: s_ashr_i32 s29, s10, 16
; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s30
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s29
; GFX12-NEXT: s_ashr_i32 s28, s11, 16
; GFX12-NEXT: s_sext_i32_i16 s11, s11
@@ -5791,11 +5789,10 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s2
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_nop 0
@@ -6019,10 +6016,10 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s4, 0xffff, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0
; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
@@ -6368,12 +6365,11 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s2, 0xffff, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0
; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_and_b32 s3, 0xffff, s6
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
@@ -6961,10 +6957,10 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s10, s5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10
; GFX12-NEXT: s_lshr_b32 s5, s5, 16
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_lshr_b32 s5, s4, 16
; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
@@ -6987,8 +6983,8 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:96
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
@@ -6999,8 +6995,8 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s2, s1, 16
; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s1, s0, 16
; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
@@ -8049,10 +8045,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_and_b32 s18, s15, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s18
; GFX12-NEXT: s_lshr_b32 s15, s15, 16
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_lshr_b32 s15, s14, 16
; GFX12-NEXT: s_and_b32 s14, s14, 0xffff
@@ -8063,8 +8059,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s14, s13, 16
; GFX12-NEXT: s_and_b32 s13, s13, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:224
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s13
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s14
; GFX12-NEXT: s_lshr_b32 s13, s12, 16
; GFX12-NEXT: s_and_b32 s12, s12, 0xffff
@@ -8075,8 +8071,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s12, s11, 16
; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:192
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s11
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s12
; GFX12-NEXT: s_lshr_b32 s11, s10, 16
; GFX12-NEXT: s_and_b32 s10, s10, 0xffff
@@ -8087,8 +8083,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s10, s9, 16
; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:160
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s9
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s10
; GFX12-NEXT: s_lshr_b32 s9, s8, 16
; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
@@ -8099,8 +8095,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s8, s7, 16
; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:128
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s8
; GFX12-NEXT: s_lshr_b32 s7, s6, 16
; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
@@ -8111,8 +8107,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s6, s5, 16
; GFX12-NEXT: s_and_b32 s5, s5, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s6
; GFX12-NEXT: s_lshr_b32 s5, s4, 16
; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
@@ -8123,8 +8119,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s4, s3, 16
; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_lshr_b32 s3, s2, 16
; GFX12-NEXT: s_and_b32 s2, s2, 0xffff
@@ -8135,8 +8131,8 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_lshr_b32 s2, s1, 16
; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshr_b32 s1, s0, 16
; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
@@ -8944,15 +8940,14 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x100000
; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s15
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s67
; GFX12-NEXT: v_dual_mov_b32 v2, s66 :: v_dual_mov_b32 v5, s65
; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s61
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s13
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s59
; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s57
; GFX12-NEXT: v_dual_mov_b32 v12, s56 :: v_dual_mov_b32 v15, s55
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index aa80081f84ffe..7f26ad7009e44 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -6257,12 +6257,11 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX12-NEXT: s_lshr_b32 s5, s3, 24
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_lshr_b32 s4, s2, 24
; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010
@@ -6867,12 +6866,11 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_lshr_b32 s3, s7, 24
; GFX12-NEXT: s_lshr_b32 s2, s5, 24
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6
@@ -7985,12 +7983,11 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_bfe_u32 s10, s7, 0x80010
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s10
; GFX12-NEXT: s_lshr_b32 s11, s7, 24
; GFX12-NEXT: s_lshr_b32 s10, s5, 24
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: s_bfe_u32 s11, s5, 0x80010
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s7
@@ -10012,7 +10009,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s3
; GFX12-NEXT: v_and_b32_e64 v3, 0xff, s6
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s5
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
@@ -10580,7 +10576,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s4
; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s7
; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s6
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s9
; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s3
; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s13
@@ -11607,7 +11602,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s2
; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s5
; GFX12-NEXT: v_and_b32_e64 v13, 0xff, s4
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v14, 0xff, s25
; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s0
; GFX12-NEXT: v_and_b32_e64 v15, 0xff, s23
@@ -11618,7 +11612,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v3, 8, s2
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s3
; GFX12-NEXT: v_lshrrev_b16 v2, 8, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_b32_e64 v19, 0xff, s17
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index b755c439b5250..aeb88bcbe0028 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -27,7 +27,6 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f32:
@@ -144,7 +143,6 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f32__offset:
@@ -262,7 +260,6 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_add_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f32:
@@ -377,7 +374,6 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_add_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f32__offset:
@@ -4950,7 +4946,6 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_v2f16:
@@ -5175,7 +5170,6 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_v2f16__offset:
@@ -5400,7 +5394,6 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: ds_pk_add_f16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_v2f16:
@@ -5616,7 +5609,6 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: ds_pk_add_f16 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_v2f16__offset:
@@ -5838,7 +5830,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_v2bf16:
@@ -6147,7 +6138,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_v2bf16__offset:
@@ -6457,7 +6447,6 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_v2bf16:
@@ -6756,7 +6745,6 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: ds_pk_add_bf16 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_v2bf16__ofset:
@@ -7054,28 +7042,26 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8
; GFX12-NEXT: s_mov_b32 s6, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s1, s5, 4
; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX12-NEXT: s_cbranch_execz .LBB28_2
; GFX12-NEXT: ; %bb.1:
-; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5
; GFX12-NEXT: s_lshl_b32 s5, s1, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB28_2:
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: s_mov_b32 s7, exec_lo
@@ -7087,11 +7073,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_cbranch_execz .LBB28_4
; GFX12-NEXT: ; %bb.3:
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GFX12-NEXT: s_lshl_b32 s0, s1, 4
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_f32 v2, v1
@@ -7932,25 +7917,23 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x8
; GFX12-NEXT: s_mov_b32 s6, exec_lo
; GFX12-NEXT: ; implicit-def: $vgpr1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s1, s5, 4
; GFX12-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX12-NEXT: s_cbranch_execz .LBB29_2
; GFX12-NEXT: ; %bb.1:
-; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_bcnt1_i32_b32 s5, s6
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5
; GFX12-NEXT: s_lshl_b32 s5, s1, 3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX12-NEXT: .LBB29_2:
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_mov_b32 s7, exec_lo
@@ -7964,11 +7947,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_cbranch_execz .LBB29_4
; GFX12-NEXT: ; %bb.3:
; GFX12-NEXT: s_bcnt1_i32_b32 s0, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GFX12-NEXT: s_lshl_b32 s0, s1, 4
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1
; GFX12-NEXT: ds_add_f32 v2, v1
; GFX12-NEXT: .LBB29_4:
@@ -8792,7 +8774,6 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode:
@@ -8909,7 +8890,6 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: ds_add_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 86d0eda70ff36..cc79db1b20af4 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -27,7 +27,6 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32:
@@ -118,7 +117,6 @@ define float @local_atomic_fmax_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32__offset:
@@ -211,7 +209,6 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32:
@@ -302,7 +299,6 @@ define void @local_atomic_fmax_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_max_num_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32__offset:
@@ -400,7 +396,6 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f64:
@@ -499,7 +494,6 @@ define double @local_atomic_fmax_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f64__offset:
@@ -600,7 +594,6 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f64:
@@ -699,7 +692,6 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f64__offset:
@@ -7124,7 +7116,6 @@ define float @local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode:
@@ -7215,7 +7206,6 @@ define void @local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index 1b112aff833a0..1ffd93e35d8cd 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -27,7 +27,6 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32:
@@ -118,7 +117,6 @@ define float @local_atomic_fmin_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32__offset:
@@ -211,7 +209,6 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32:
@@ -302,7 +299,6 @@ define void @local_atomic_fmin_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_min_num_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32__offset:
@@ -400,7 +396,6 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f64:
@@ -499,7 +494,6 @@ define double @local_atomic_fmin_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f64__offset:
@@ -600,7 +594,6 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f64:
@@ -699,7 +692,6 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f64__offset:
@@ -7124,7 +7116,6 @@ define float @local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode:
@@ -7215,7 +7206,6 @@ define void @local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode:
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index 74710fb1aa01b..acba2841a7107 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -290,7 +290,6 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[4:5], v8, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%id.x = call i32 @llvm.amdgcn.workgroup.id.x()
%id.y = call i32 @llvm.amdgcn.workgroup.id.y()
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
index f253b2a80cf9d..e81b2d0c6c9e0 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
@@ -185,7 +185,6 @@ define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[4:5], v8, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%id.x = call i32 @llvm.amdgcn.workgroup.id.x()
%id.y = call i32 @llvm.amdgcn.workgroup.id.y()
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 62a6edd9e743d..8eb0a46cc8b17 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -53,7 +53,6 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -106,7 +105,6 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -159,7 +157,6 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = zext i32 %arg0 to i64
%sext1 = zext i32 %arg1 to i64
@@ -212,7 +209,6 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = zext i32 %arg0 to i64
%sext1 = zext i32 %arg1 to i64
@@ -399,7 +395,6 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i128
%sext1 = sext i32 %arg1 to i128
@@ -452,7 +447,6 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i63
%sext1 = sext i32 %arg1 to i63
@@ -520,7 +514,6 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 31
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i31 %arg0 to i63
%sext1 = sext i31 %arg1 to i63
@@ -596,7 +589,6 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v5
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v4, v[1:2]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ext0 = sext i32 %arg0 to i64
%ext1 = zext i32 %arg1 to i64
@@ -649,7 +641,6 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
%trunc.rhs = and i64 %arg1, 4294967295
@@ -724,7 +715,6 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 8589934591
%trunc.rhs = and i64 %arg1, 4294967295
@@ -800,7 +790,6 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v2, v[4:5]
; GFX12-NEXT: v_and_b32_e32 v2, 1, v3
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v2, v[1:2]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
%trunc.rhs = and i64 %arg1, 8589934591
@@ -853,7 +842,6 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v2, v[4:5]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%shl.lhs = shl i64 %arg0, 32
%trunc.lhs = ashr i64 %shl.lhs, 32
@@ -909,7 +897,6 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[0:1]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%tmp4 = lshr i64 %arg0, 32
%tmp5 = and i64 %arg0, 4294967295
@@ -1076,7 +1063,6 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -1192,7 +1178,6 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -1274,7 +1259,6 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
%sext1 = sext i32 %arg1 to i64
@@ -1347,7 +1331,6 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
; GFX12-NEXT: v_mul_lo_u32 v2, v6, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v1, v2, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%m = mul i48 %arg0, %arg1
%a = add i48 %m, %arg2
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 2e73f37458242..0889f8ef6316e 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2595,7 +2595,6 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX12-NEXT: s_branch .LBB16_2
; GFX12-NEXT: .LBB16_4:
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: .LBB16_5: ; %endif
; GFX12-NEXT: s_mov_b32 s3, 0x31016000
@@ -2928,7 +2927,6 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX12-NEXT: s_mov_b32 s14, s8
; GFX12-NEXT: s_mov_b32 s2, s9
; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[12:13]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_u64 s[20:21], s[2:3], s[12:13]
; GFX12-NEXT: s_mov_b32 s12, s23
; GFX12-NEXT: s_mov_b32 s16, s5
@@ -2945,13 +2943,10 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX12-NEXT: s_mov_b32 s25, s6
; GFX12-NEXT: s_add_nc_u64 s[6:7], s[12:13], s[18:19]
; GFX12-NEXT: s_mov_b32 s23, s13
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7]
; GFX12-NEXT: s_or_b64 s[8:9], s[22:23], s[24:25]
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_mov_b32 s3, 0x31016000
; GFX12-NEXT: s_mov_b32 s2, -1
@@ -3359,7 +3354,6 @@ define i32 @mul_pow2_plus_1(i32 %val) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: mul_pow2_plus_1:
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index 97d4f4696e827..c7d8e7d7db70a 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -44,7 +44,6 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 1
%load = load i8, ptr %gep, align 4
@@ -84,7 +83,6 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 2047
%load = load i8, ptr %gep, align 4
@@ -124,7 +122,6 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 4095
%load = load i8, ptr %gep, align 4
@@ -168,7 +165,6 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
@@ -230,7 +226,6 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
@@ -292,7 +287,6 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -2048
%load = load i8, ptr %gep, align 4
@@ -336,7 +330,6 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -4096
%load = load i8, ptr %gep, align 4
@@ -380,7 +373,6 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -8192
%load = load i8, ptr %gep, align 4
@@ -424,7 +416,6 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8388608
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -8388608
%load = load i8, ptr %gep, align 4
@@ -465,7 +456,6 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 4095
%load = load i8, ptr %gep, align 4
@@ -509,7 +499,6 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
@@ -571,7 +560,6 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
@@ -635,7 +623,6 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388606
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
@@ -667,7 +654,6 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 16777214
%load = load i8, ptr %gep, align 4
@@ -711,7 +697,6 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -4096
%load = load i8, ptr %gep, align 4
@@ -755,7 +740,6 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -8192
%load = load i8, ptr %gep, align 4
@@ -799,7 +783,6 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -16384
%load = load i8, ptr %gep, align 4
@@ -845,7 +828,6 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
@@ -859,7 +841,6 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -16777215
%load = load i8, ptr %gep, align 4
@@ -906,7 +887,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
@@ -965,7 +945,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589936639
%load = load i8, ptr %gep, align 4
@@ -1012,7 +991,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
@@ -1071,7 +1049,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589936640
%load = load i8, ptr %gep, align 4
@@ -1118,7 +1095,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
@@ -1177,7 +1153,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589938687
%load = load i8, ptr %gep, align 4
@@ -1224,7 +1199,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
@@ -1283,7 +1257,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589938688
%load = load i8, ptr %gep, align 4
@@ -1330,7 +1303,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
@@ -1389,7 +1361,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589942783
%load = load i8, ptr %gep, align 4
@@ -1436,7 +1407,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
@@ -1495,7 +1465,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 8589942784
%load = load i8, ptr %gep, align 4
@@ -1543,7 +1512,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
@@ -1602,7 +1570,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854773761
%load = load i8, ptr %gep, align 4
@@ -1650,7 +1617,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
@@ -1709,7 +1675,6 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854773760
%load = load i8, ptr %gep, align 4
@@ -1757,7 +1722,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
@@ -1816,7 +1780,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854771713
%load = load i8, ptr %gep, align 4
@@ -1864,7 +1827,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
@@ -1923,7 +1885,6 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854771712
%load = load i8, ptr %gep, align 4
@@ -1971,7 +1932,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
@@ -2030,7 +1990,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854767617
%load = load i8, ptr %gep, align 4
@@ -2078,7 +2037,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
@@ -2137,7 +2095,6 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1]
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr %p, i64 -9223372036854767616
%load = load i8, ptr %gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index b0d10aa24ce69..713583f29300e 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -42,7 +42,6 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 1
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -80,7 +79,6 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:2047
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -120,7 +118,6 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: global_inst_valu_offset_12bit_max:
@@ -173,7 +170,6 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8191
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_13bit_max:
@@ -244,7 +240,6 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8388607
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_24bit_max:
@@ -309,7 +304,6 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-2048
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -349,7 +343,6 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -393,7 +386,6 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8192
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -437,7 +429,6 @@ define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8388608
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8388608
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -477,7 +468,6 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_11bit_max:
@@ -530,7 +520,6 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:8191
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
@@ -601,7 +590,6 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:16383
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
@@ -674,7 +662,6 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
@@ -715,7 +702,6 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8388606
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -755,7 +741,6 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-4096
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -799,7 +784,6 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-8192
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -843,7 +827,6 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_u8 v0, v[0:1], off offset:-16384
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -889,7 +872,6 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
@@ -930,7 +912,6 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8388607
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -996,7 +977,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
@@ -1037,7 +1017,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1102,7 +1081,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
@@ -1143,7 +1121,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1208,7 +1185,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
@@ -1249,7 +1225,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1314,7 +1289,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
@@ -1355,7 +1329,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1420,7 +1393,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
@@ -1461,7 +1433,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1526,7 +1497,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
@@ -1567,7 +1537,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1632,7 +1601,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
@@ -1674,7 +1642,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386561
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1739,7 +1706,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
@@ -1781,7 +1747,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8386560
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1846,7 +1811,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
@@ -1888,7 +1852,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384513
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -1953,7 +1916,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
@@ -1995,7 +1957,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8384512
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -2060,7 +2021,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
@@ -2102,7 +2062,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380417
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
%load = load i8, ptr addrspace(1) %gep, align 4
@@ -2167,7 +2126,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
@@ -2209,7 +2167,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:-8380416
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
%load = load i8, ptr addrspace(1) %gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
index fa8be7ee1b33e..a642543c3780d 100644
--- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
+++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll
@@ -7,15 +7,14 @@ define amdgpu_cs float @v_s_exp_f32(float inreg %src) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_cmp_lt_f32 s0, 0xc2fc0000
; GFX12-NEXT: s_cselect_b32 s1, 0x42800000, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_add_f32 s0, s0, s1
; GFX12-NEXT: s_cselect_b32 s1, 0x1f800000, 1.0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_s_exp_f32 s0, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_f32 s0, s0, s1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%result = call float @llvm.exp2.f32(float %src)
@@ -60,15 +59,14 @@ define amdgpu_cs float @v_s_log_f32(float inreg %src) {
; GFX12: ; %bb.0:
; GFX12-NEXT: s_cmp_lt_f32 s0, 0x800000
; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_mul_f32 s0, s0, s1
; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_s_log_f32 s0, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-NEXT: s_sub_f32 s0, s0, s1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%result = call float @llvm.log2.f32(float %src)
@@ -169,24 +167,24 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mul_f32 s1, s0, 0x4f800000
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xf800000
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(TRANS32_DEP_1)
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_cselect_b32 s1, s1, s0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: v_s_sqrt_f32 s2, s1
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
; GFX12-SDAG-NEXT: s_mov_b32 s4, s1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_add_co_i32 s3, s2, -1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_xor_b32 s5, s3, 0x80000000
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-SDAG-NEXT: s_fmac_f32 s4, s5, s2
; GFX12-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_cmp_le_f32 s4, 0
; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s2
; GFX12-SDAG-NEXT: s_add_co_i32 s4, s2, 1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_xor_b32 s6, s4, 0x80000000
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
; GFX12-SDAG-NEXT: s_fmac_f32 s5, s6, s2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-SDAG-NEXT: s_cmp_gt_f32 s5, 0
; GFX12-SDAG-NEXT: s_cselect_b32 s2, s4, s3
; GFX12-SDAG-NEXT: s_cmp_lt_f32 s0, 0xf800000
@@ -207,32 +205,32 @@ define amdgpu_cs float @v_s_sqrt_f32(float inreg %src) {
; GFX12-GISEL-NEXT: s_cmp_gt_f32 0xf800000, s0
; GFX12-GISEL-NEXT: s_mul_f32 s2, s0, 0x4f800000
; GFX12-GISEL-NEXT: s_cselect_b32 s1, 1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_cselect_b32 s0, s2, s0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_s_sqrt_f32 s2, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_mov_b32 s4, s0
; GFX12-GISEL-NEXT: s_mov_b32 s6, s0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_add_co_i32 s3, s2, -1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_xor_b32 s5, s3, 0x80000000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_fmac_f32 s4, s5, s2
; GFX12-GISEL-NEXT: s_add_co_i32 s5, s2, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_xor_b32 s7, s5, 0x80000000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
; GFX12-GISEL-NEXT: s_cmp_le_f32 s4, 0
; GFX12-GISEL-NEXT: s_fmac_f32 s6, s7, s2
; GFX12-GISEL-NEXT: s_cselect_b32 s2, s3, s2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_3)
; GFX12-GISEL-NEXT: s_cmp_gt_f32 s6, 0
; GFX12-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; GFX12-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX12-GISEL-NEXT: s_mul_f32 s3, s2, 0x37800000
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: s_cselect_b32 s1, s3, s2
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s1
; GFX12-GISEL-NEXT: v_cmp_class_f32_e64 s1, s0, 0x260
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
; GFX12-GISEL-NEXT: ; return to shader part epilog
%result = call float @llvm.sqrt.f32(float %src)
@@ -276,15 +274,14 @@ define amdgpu_cs float @srcmods_abs_f32(float inreg %src) {
; GFX12-LABEL: srcmods_abs_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_bitset0_b32 s0, 31
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_cmp_lt_f32 s0, 0x800000
; GFX12-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mul_f32 s0, s0, s1
; GFX12-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
; GFX12-NEXT: v_s_log_f32 s0, s0
+; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_sub_f32 s0, s0, s1
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
@@ -301,30 +298,28 @@ define amdgpu_cs float @srcmods_neg_f32(float inreg %src) {
; GFX12-SDAG-NEXT: s_xor_b32 s1, s0, 0x80000000
; GFX12-SDAG-NEXT: s_cmp_gt_f32 s0, 0x80800000
; GFX12-SDAG-NEXT: s_cselect_b32 s0, 0x4f800000, 1.0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-SDAG-NEXT: s_mul_f32 s0, s1, s0
; GFX12-SDAG-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-SDAG-NEXT: v_s_log_f32 s0, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
; GFX12-SDAG-NEXT: s_sub_f32 s0, s0, s1
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: srcmods_neg_f32:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_xor_b32 s0, s0, 0x80000000
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_cmp_lt_f32 s0, 0x800000
; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x4f800000, 1.0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1
; GFX12-GISEL-NEXT: s_cselect_b32 s1, 0x42000000, 0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
; GFX12-GISEL-NEXT: v_s_log_f32 s0, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_sub_f32 s0, s0, s1
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
index faafb77782cae..50a3336a7483c 100644
--- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
@@ -31,7 +31,6 @@ define float @v_test_fmin_legacy_ule_f32_safe(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -61,7 +60,6 @@ define float @v_test_fmin_legacy_ule_f32_nnan_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule float %a, %b
%val = select nnan i1 %cmp, float %a, float %b
@@ -91,7 +89,6 @@ define float @v_test_fmin_legacy_ule_f32_nsz_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule float %a, %b
%val = select nsz i1 %cmp, float %a, float %b
@@ -119,7 +116,6 @@ define float @v_test_fmin_legacy_ule_f32_nnan_nsz_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule float %a, %b
%val = select nnan nsz i1 %cmp, float %a, float %b
@@ -149,7 +145,6 @@ define float @v_test_fmax_legacy_uge_f32_safe(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge float %a, %b
%val = select i1 %cmp, float %a, float %b
@@ -179,7 +174,6 @@ define float @v_test_fmax_legacy_uge_f32_nnan_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge float %a, %b
%val = select nnan i1 %cmp, float %a, float %b
@@ -209,7 +203,6 @@ define float @v_test_fmax_legacy_uge_f32_nsz_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge float %a, %b
%val = select nsz i1 %cmp, float %a, float %b
@@ -237,7 +230,6 @@ define float @v_test_fmax_legacy_uge_f32_nnan_nsz_flag(float %a, float %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge float %a, %b
%val = select nnan nsz i1 %cmp, float %a, float %b
@@ -272,7 +264,6 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_safe(<2 x float> %a, <2 x float
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x float> %a, %b
%val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -307,7 +298,6 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_flag(<2 x float> %a, <2 x
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x float> %a, %b
%val = select nnan <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -342,7 +332,6 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nsz_flag(<2 x float> %a, <2 x f
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x float> %a, %b
%val = select nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -377,7 +366,6 @@ define <2 x float> @v_test_fmin_legacy_ule_v2f32_nnan_nsz_flag(<2 x float> %a, <
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x float> %a, %b
%val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -412,7 +400,6 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_safe(<2 x float> %a, <2 x float
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x float> %a, %b
%val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -447,7 +434,6 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_flag(<2 x float> %a, <2 x
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x float> %a, %b
%val = select nnan <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -482,7 +468,6 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nsz_flag(<2 x float> %a, <2 x f
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x float> %a, %b
%val = select nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -517,7 +502,6 @@ define <2 x float> @v_test_fmax_legacy_uge_v2f32_nnan_nsz_flag(<2 x float> %a, <
; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
; GFX12-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x float> %a, %b
%val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
@@ -551,7 +535,6 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule half %a, %b
%val = select i1 %cmp, half %a, half %b
@@ -585,7 +568,6 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule half %a, %b
%val = select nnan i1 %cmp, half %a, half %b
@@ -619,7 +601,6 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule half %a, %b
%val = select nsz i1 %cmp, half %a, half %b
@@ -651,7 +632,6 @@ define half @v_test_fmin_legacy_ule_f16_nnan_nsz_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule half %a, %b
%val = select nnan nsz i1 %cmp, half %a, half %b
@@ -685,7 +665,6 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge half %a, %b
%val = select i1 %cmp, half %a, half %b
@@ -719,7 +698,6 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge half %a, %b
%val = select nnan i1 %cmp, half %a, half %b
@@ -753,7 +731,6 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge half %a, %b
%val = select nsz i1 %cmp, half %a, half %b
@@ -785,7 +762,6 @@ define half @v_test_fmax_legacy_uge_f16_nnan_nsz_flag(half %a, half %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge half %a, %b
%val = select nnan nsz i1 %cmp, half %a, half %b
@@ -836,7 +812,6 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> %
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x half> %a, %b
%val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -887,7 +862,6 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x half> %a, %b
%val = select nnan <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -938,7 +912,6 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x half> %a, %b
%val = select nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -975,7 +948,6 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_nsz_flag(<2 x half> %a, <2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <2 x half> %a, %b
%val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -1026,7 +998,6 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> %
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x half> %a, %b
%val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -1077,7 +1048,6 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x half> %a, %b
%val = select nnan <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -1128,7 +1098,6 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX12-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x half> %a, %b
%val = select nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -1165,7 +1134,6 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_nsz_flag(<2 x half> %a, <2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <2 x half> %a, %b
%val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -1241,7 +1209,6 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <4 x half> %a, %b
%val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1317,7 +1284,6 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <4 x half> %a, %b
%val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1393,7 +1359,6 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <4 x half> %a, %b
%val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1442,7 +1407,6 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_nsz_flag(<4 x half> %a, <4
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2
; GFX12-NEXT: v_pk_min_num_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule <4 x half> %a, %b
%val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1518,7 +1482,6 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <4 x half> %a, %b
%val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1594,7 +1557,6 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <4 x half> %a, %b
%val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1670,7 +1632,6 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
; GFX12-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <4 x half> %a, %b
%val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1719,7 +1680,6 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_nsz_flag(<4 x half> %a, <4
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v3
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp uge <4 x half> %a, %b
%val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1753,7 +1713,6 @@ define float @v_test_fmin_legacy_uge_f32_nsz_flag__nnan_srcs(float %arg0, float
; GFX12-NEXT: v_dual_add_f32 v0, v0, v0 :: v_dual_add_f32 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a = fadd nnan float %arg0, %arg0
%b = fadd nnan float %arg1, %arg1
@@ -1789,7 +1748,6 @@ define float @v_test_fmax_legacy_uge_f32_nsz_flag__nnan_srcs(float %arg0, float
; GFX12-NEXT: v_dual_add_f32 v0, v0, v0 :: v_dual_add_f32 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1
-; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a = fadd nnan float %arg0, %arg0
%b = fadd nnan float %arg1, %arg1
diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
index d84fe9bf21d65..276ccf0ae9869 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
@@ -781,42 +781,51 @@ frameInfo:
body: |
; GCN-O0-LABEL: name: hazard_calls
; GCN-O0: bb.0:
+ ; GCN-O0-NEXT: $sgpr16 = S_MOV_B32 0
; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O0-NEXT: S_SETPC_B64 $sgpr0_sgpr1
; GCN-O0-NEXT: {{ $}}
; GCN-O0-NEXT: bb.1:
+ ; GCN-O0-NEXT: $sgpr18 = S_MOV_B32 0
; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O0-NEXT: S_SETPC_B64_return $sgpr0_sgpr1
; GCN-O0-NEXT: {{ $}}
; GCN-O0-NEXT: bb.2:
; GCN-O0-NEXT: successors: %bb.3(0x80000000)
; GCN-O0-NEXT: {{ $}}
+ ; GCN-O0-NEXT: $sgpr20 = S_MOV_B32 0
; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O0-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O0-NEXT: $sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc
; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O0-NEXT: {{ $}}
; GCN-O0-NEXT: bb.3:
; GCN-O0-NEXT: successors: %bb.4(0x80000000)
; GCN-O0-NEXT: {{ $}}
- ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O0-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O0-NEXT: {{ $}}
; GCN-O0-NEXT: bb.4:
+ ; GCN-O0-NEXT: $sgpr22 = S_MOV_B32 $sgpr8
+ ; GCN-O0-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O0-NEXT: S_ENDPGM 0
;
; GCN-O2-LABEL: name: hazard_calls
; GCN-O2: bb.0:
+ ; GCN-O2-NEXT: $sgpr16 = S_MOV_B32 0
; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O2-NEXT: S_SETPC_B64 $sgpr0_sgpr1
; GCN-O2-NEXT: {{ $}}
; GCN-O2-NEXT: bb.1:
+ ; GCN-O2-NEXT: $sgpr18 = S_MOV_B32 0
; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O2-NEXT: S_SETPC_B64_return $sgpr0_sgpr1
; GCN-O2-NEXT: {{ $}}
; GCN-O2-NEXT: bb.2:
; GCN-O2-NEXT: successors: %bb.3(0x80000000)
; GCN-O2-NEXT: {{ $}}
+ ; GCN-O2-NEXT: $sgpr20 = S_MOV_B32 0
; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
; GCN-O2-NEXT: $sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3
; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
@@ -829,14 +838,19 @@ body: |
; GCN-O2-NEXT: $sgpr8_sgpr9 = S_CALL_B64 0
; GCN-O2-NEXT: {{ $}}
; GCN-O2-NEXT: bb.4:
+ ; GCN-O2-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-O2-NEXT: $sgpr22 = S_MOV_B32 $sgpr8
; GCN-O2-NEXT: S_ENDPGM 0
bb.0:
+ $sgpr16 = S_MOV_B32 0
S_SETPC_B64 $sgpr0_sgpr1
bb.1:
+ $sgpr18 = S_MOV_B32 0
S_SETPC_B64_return $sgpr0_sgpr1
bb.2:
+ $sgpr20 = S_MOV_B32 0
$sgpr4_sgpr5 = S_SWAPPC_B64 $sgpr2_sgpr3
$sgpr4 = S_ADD_U32 $sgpr4, 0, implicit-def $scc
@@ -844,5 +858,6 @@ body: |
$sgpr8_sgpr9 = S_CALL_B64 0
bb.4:
+ $sgpr22 = S_MOV_B32 $sgpr8
S_ENDPGM 0
...
>From b80da10317d51964e5fe6cf5792bc074bc0b9cea Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Wed, 24 Jul 2024 16:05:54 +0900
Subject: [PATCH 3/3] Address reviewer comments.
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 29 ++++++++++---------
.../CodeGen/AMDGPU/valu-read-sgpr-hazard.mir | 4 +--
2 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 45c2624e43d4c..d6b0f682c1f84 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2919,7 +2919,9 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
return true;
}
-static unsigned baseSGPRNumber(Register Reg, const SIRegisterInfo &TRI) {
+// Return the numeric ID 0-63 of a 64b SGPR pair for a given SGPR.
+// e.g. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc
+static unsigned sgprPairNumber(Register Reg, const SIRegisterInfo &TRI) {
unsigned RegN = TRI.getEncodingValue(Reg);
assert(RegN <= 127);
return (RegN >> 1) & 0x3f;
@@ -2965,13 +2967,14 @@ void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
for (auto &MI : reverse(MBB->instrs())) {
bool IsVALU = SIInstrInfo::isVALU(MI);
bool IsSALU = SIInstrInfo::isSALU(MI);
- if (!(IsVALU || IsSALU))
+ if (!IsVALU && !IsSALU)
continue;
for (const MachineOperand &Op : MI.operands()) {
if (!Op.isReg())
continue;
Register Reg = Op.getReg();
+ assert(!Op.getSubReg());
// Only consider implicit operands of VCC.
if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
@@ -2980,7 +2983,7 @@ void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
continue;
if (TRI.getEncodingValue(Reg) >= SGPR_NULL)
continue;
- unsigned RegN = baseSGPRNumber(Reg, TRI);
+ unsigned RegN = sgprPairNumber(Reg, TRI);
if (IsVALU && Op.isUse()) {
// Note: any access within a cycle must be considered a hazard.
if (InCycle || (ReadSGPRs[RegN] && SALUWriteSGPRs[RegN]))
@@ -3054,10 +3057,9 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
// All SGPR writes before a call/return must be flushed as the callee/caller
// will not will not see the hazard chain, i.e. (2) to (3) described above.
- const bool IsSetPC = (MI->getOpcode() == AMDGPU::S_SETPC_B64 ||
- MI->getOpcode() == AMDGPU::S_SETPC_B64_return ||
- MI->getOpcode() == AMDGPU::S_SWAPPC_B64 ||
- MI->getOpcode() == AMDGPU::S_CALL_B64);
+ const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
+ !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
+ MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);
// Collect all SGPR sources for MI which are read by a VALU.
const unsigned SGPR_NULL = TRI.getEncodingValue(AMDGPU::SGPR_NULL_gfx11plus);
@@ -3080,7 +3082,7 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (TRI.getEncodingValue(OpReg) >= SGPR_NULL)
continue;
- unsigned RegN = baseSGPRNumber(OpReg, TRI);
+ unsigned RegN = sgprPairNumber(OpReg, TRI);
if (!VALUReadHazardSGPRs[RegN])
continue;
@@ -3101,7 +3103,7 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (IsSetPC && I.getNumDefs() > 0)
return true;
// Check for any register writes.
- return llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
+ return any_of(SGPRsUsed, [this, &I](Register Reg) {
return I.modifiesRegister(Reg, &TRI);
});
};
@@ -3122,9 +3124,8 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (!SIInstrInfo::isSALU(I) || SIInstrInfo::isSOPP(I))
return 0;
// SALU must be unrelated to any hazard registers.
- if (llvm::any_of(SGPRsUsed, [this, &I](Register Reg) {
- return I.readsRegister(Reg, &TRI);
- }))
+ if (any_of(SGPRsUsed,
+ [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))
return 0;
return 1;
};
@@ -3146,14 +3147,14 @@ bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
return Register(AMDGPU::VCC);
// TODO: handle TTMP?
- return Register(AMDGPU::SGPR0_SGPR1 + baseSGPRNumber(Reg, TRI));
+ return Register(AMDGPU::SGPR0_SGPR1 + sgprPairNumber(Reg, TRI));
};
auto SearchHazardFn = [this, hazardPair,
&SGPRsUsed](const MachineInstr &I) {
if (!SIInstrInfo::isVALU(I))
return false;
// Check for any register reads.
- return llvm::any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
+ return any_of(SGPRsUsed, [this, hazardPair, &I](Register Reg) {
return I.readsRegister(hazardPair(Reg), &TRI);
});
};
diff --git a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
index 276ccf0ae9869..b0de09c9f0fc6 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O0 %s
-# RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O2 %s
+# RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O0 %s
+# RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN-O2 %s
--- |
@mem = internal unnamed_addr addrspace(4) constant [4 x <4 x i32>] [<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>]
More information about the llvm-commits
mailing list