[llvm] [AMDGPU] Rework GFX11 VALU Mask Write Hazard (PR #138663)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Thu May 8 06:17:00 PDT 2025
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/138663
From a9cec30cbba48c447d490272ccb8faf39ed43063 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Tue, 6 May 2025 11:51:47 +0900
Subject: [PATCH 1/4] [AMDGPU] Rework GFX11 VALU Mask Write Hazard
Apply additional counter waits to address VALU writes to SGPRs.
Rework expiry detection and apply wait coalescing to mitigate some
of the additional waits.
---
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 224 +++++++---
.../atomic_optimizations_global_pointer.ll | 52 ++-
.../atomic_optimizations_local_pointer.ll | 135 ++++--
.../atomic_optimizations_pixelshader.ll | 1 +
llvm/test/CodeGen/AMDGPU/branch-relaxation.ll | 2 +-
.../global-saddr-atomics-min-max-system.ll | 16 +
.../AMDGPU/global_atomics_scan_fmax.ll | 29 +-
.../AMDGPU/global_atomics_scan_fmin.ll | 29 +-
.../CodeGen/AMDGPU/nor-divergent-lanemask.ll | 3 +-
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 2 +
.../CodeGen/AMDGPU/valu-mask-write-hazard.mir | 388 +++++++++++++++++-
11 files changed, 731 insertions(+), 150 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index aaefe27b1324f..99a85a1f969fc 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2968,29 +2968,102 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
return false;
assert(!ST.hasExtendedWaitCounts());
- if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
+ if (!ST.isWave64())
+ return false;
+
+ const bool IsSALU = SIInstrInfo::isSALU(*MI);
+ const bool IsVALU = SIInstrInfo::isVALU(*MI);
+ if (!IsSALU && !IsVALU)
return false;
// The hazard sequence is three instructions:
// 1. VALU reads SGPR as mask
- // 2. SALU writes SGPR
- // 3. SALU reads SGPR
+ // 2. VALU/SALU writes SGPR
+ // 3. VALU/SALU reads SGPR
// The hazard can expire if the distance between 2 and 3 is sufficient.
// In practice this happens <10% of the time, hence this always assumes
// the hazard exists if 1 and 2 are present to avoid searching.
- const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
- if (!SDSTOp || !SDSTOp->isReg())
- return false;
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
- const Register HazardReg = SDSTOp->getReg();
- if (HazardReg == AMDGPU::EXEC ||
- HazardReg == AMDGPU::EXEC_LO ||
- HazardReg == AMDGPU::EXEC_HI ||
- HazardReg == AMDGPU::M0)
+ auto IgnoreableSGPR = [](const Register Reg) {
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::M0:
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::SGPR_NULL64:
+ case AMDGPU::SCC:
+ return true;
+ default:
+ return false;
+ }
+ };
+ auto IsVCC = [](const Register Reg) {
+ return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
+ };
+
+ struct StateType {
+ SmallSet<Register, 2> HazardSGPRs;
+ };
+
+ SmallVector<const MachineInstr *> WaitInstrs;
+ bool HasSGPRRead = false;
+ StateType InitialState;
+
+ // Look for SGPR write.
+ MachineOperand *HazardDef = nullptr;
+ for (MachineOperand &Op : MI->operands()) {
+ if (!Op.isReg())
+ continue;
+ if (Op.isDef() && HazardDef)
+ continue;
+
+ Register Reg = Op.getReg();
+ if (IgnoreableSGPR(Reg))
+ continue;
+ if (!IsVCC(Reg)) {
+ if (Op.isImplicit())
+ continue;
+ if (!TRI->isSGPRReg(MRI, Reg))
+ continue;
+ }
+ // Also check for SGPR reads.
+ if (Op.isUse()) {
+ HasSGPRRead = true;
+ continue;
+ }
+
+ assert(!HazardDef);
+ HazardDef = &Op;
+ }
+
+ if (!HazardDef)
return false;
- auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
+ const Register HazardReg = HazardDef->getReg();
+ auto *HazardRegRC = TRI->getPhysRegBaseClass(HazardReg);
+ bool IsSGPR32 = (HazardRegRC == TRI->getSGPRClassForBitWidth(32)) ||
+ HazardReg == AMDGPU::VCC_LO || HazardReg == AMDGPU::VCC_HI;
+
+ // Setup to track writes to individual SGPRs
+ if (IsSGPR32) {
+ InitialState.HazardSGPRs.insert(HazardReg);
+ } else if (IsVCC(HazardReg)) {
+ InitialState.HazardSGPRs.insert(AMDGPU::VCC_LO);
+ InitialState.HazardSGPRs.insert(AMDGPU::VCC_HI);
+ } else {
+ assert(HazardRegRC && HazardRegRC == TRI->getSGPRClassForBitWidth(64));
+ InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
+ InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
+ }
+
+ auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
+ if (State.HazardSGPRs.empty())
+ return HazardExpired;
+
switch (I.getOpcode()) {
case AMDGPU::V_ADDC_U32_e32:
case AMDGPU::V_ADDC_U32_dpp:
@@ -3005,11 +3078,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
case AMDGPU::V_SUBB_U32_e32:
case AMDGPU::V_SUBB_U32_dpp:
case AMDGPU::V_SUBBREV_U32_e32:
- case AMDGPU::V_SUBBREV_U32_dpp:
+ case AMDGPU::V_SUBBREV_U32_dpp: {
// These implicitly read VCC as mask source.
- return HazardReg == AMDGPU::VCC ||
- HazardReg == AMDGPU::VCC_LO ||
- HazardReg == AMDGPU::VCC_HI;
+ return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
+ }
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_ADDC_U32_e64_dpp:
case AMDGPU::V_CNDMASK_B16_t16_e64:
@@ -3025,68 +3097,102 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
// Only check mask register overlaps.
const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
assert(SSRCOp);
- return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
+ bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
+ return Result ? HazardFound : NoHazardFound;
}
default:
- return false;
+ return NoHazardFound;
}
};
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
- // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
- if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
- return true;
-
- // VALU access to any SGPR or literal constant other than HazardReg
- // mitigates hazard. No need to check HazardReg here as this will
- // only be called when !IsHazardFn.
- if (!SIInstrInfo::isVALU(I))
- return false;
- for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
- const MachineOperand &Op = I.getOperand(OpNo);
- if (Op.isReg()) {
- Register OpReg = Op.getReg();
- // Only consider uses
- if (!Op.isUse())
+ auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
+ switch (I.getOpcode()) {
+ case AMDGPU::S_WAITCNT_DEPCTR:
+ // Record waits within region of instructions free of SGPR reads.
+ if (!HasSGPRRead && I.getParent() == MI->getParent())
+ WaitInstrs.push_back(&I);
+ break;
+ default:
+ // Update tracking of SGPR reads and writes.
+ for (auto &Op : I.operands()) {
+ if (!Op.isReg())
continue;
- // Ignore EXEC
- if (OpReg == AMDGPU::EXEC ||
- OpReg == AMDGPU::EXEC_LO ||
- OpReg == AMDGPU::EXEC_HI)
+
+ Register Reg = Op.getReg();
+ if (IgnoreableSGPR(Reg))
continue;
- // Ignore all implicit uses except VCC
- if (Op.isImplicit()) {
- if (OpReg == AMDGPU::VCC ||
- OpReg == AMDGPU::VCC_LO ||
- OpReg == AMDGPU::VCC_HI)
- return true;
+ if (!IsVCC(Reg)) {
+ if (Op.isImplicit())
+ continue;
+ if (!TRI->isSGPRReg(MRI, Reg))
+ continue;
+ }
+ if (Op.isUse()) {
+ HasSGPRRead = true;
continue;
}
- if (TRI.isSGPRReg(MRI, OpReg))
- return true;
- } else {
- const MCInstrDesc &InstDesc = I.getDesc();
- const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
- if (!TII.isInlineConstant(Op, OpInfo))
- return true;
+
+ // Stop tracking any SGPRs with writes on the basis that they will
+ // already have an appropriate wait inserted afterwards.
+ SmallVector<Register, 2> Found;
+ for (Register SGPR : State.HazardSGPRs) {
+ if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
+ Found.push_back(SGPR);
+ }
+ for (Register SGPR : Found)
+ State.HazardSGPRs.erase(SGPR);
}
+ break;
}
- return false;
};
// Check for hazard
- if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
- std::numeric_limits<int>::max())
+ DenseSet<const MachineBasicBlock *> Visited;
+ if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
+ MI->getParent(),
+ std::next(MI->getReverseIterator()), Visited))
return false;
- auto NextMI = std::next(MI->getIterator());
+ // Compute counter mask
+ unsigned DepCtr =
+ IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
+ : AMDGPU::DepCtr::encodeFieldVaSdst(0))
+ : AMDGPU::DepCtr::encodeFieldSaSdst(0);
+
+ // Try to merge previous waits into this one for regions with no SGPR reads.
+ if (WaitInstrs.size()) {
+ const unsigned ConstantBits = AMDGPU::DepCtr::encodeFieldSaSdst(
+ AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0),
+ 0),
+ 0);
+
+ for (const MachineInstr *Instr : WaitInstrs) {
+ // Don't touch bundled waits.
+ if (Instr->isBundled())
+ continue;
+ MachineInstr *WaitMI = const_cast<MachineInstr *>(Instr);
+ unsigned WaitMask = WaitMI->getOperand(0).getImm();
+ // Only work with counters related to this hazard.
+ if ((WaitMask & ConstantBits) != ConstantBits)
+ continue;
+ DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
+ AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
+ DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
+ AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
+ DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
+ AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
+ WaitMI->eraseFromParent();
+ }
+ }
- // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
+ // Add s_waitcnt_depctr after SGPR write.
+ auto NextMI = std::next(MI->getIterator());
auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(DepCtr);
// SALU write may be s_getpc in a bundle.
updateGetPCBundle(NewMI);
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index f979d01e495ba..3cfc7b0751bdf 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -3165,38 +3165,39 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -3250,6 +3251,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v7, null, s3, v9, vcc
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
@@ -6729,38 +6731,39 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -6814,6 +6817,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e64 v7, null, s3, v9, vcc
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
@@ -8818,7 +8822,7 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS-NEXT: ; implicit-def: $vgpr0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB15_2
; GFX7LESS-NEXT: ; %bb.1:
@@ -9328,7 +9332,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s7, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX7LESS-NEXT: ; implicit-def: $vgpr0
+; GFX7LESS-NEXT: ; implicit-def: $vgpr0
; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB16_4
; GFX7LESS-NEXT: ; %bb.1:
@@ -11377,6 +11381,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -11433,6 +11438,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12411,19 +12417,20 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX1164-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX1164-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0
; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX1164-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12472,7 +12479,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX1164-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xf1ff
; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -12482,6 +12489,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 9775a37276dfd..d1c2aab84317d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -963,6 +963,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -2659,43 +2660,45 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2720,6 +2723,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -3314,34 +3318,35 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v2, 0, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0
; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffc
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v4, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
@@ -4370,6 +4375,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -6091,43 +6097,45 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6152,6 +6160,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -6779,6 +6788,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -7503,6 +7513,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
@@ -8140,6 +8151,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -8863,6 +8875,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
@@ -9500,6 +9513,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -10223,6 +10237,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
@@ -10860,6 +10875,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -11139,6 +11155,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -11974,16 +11991,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, 1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1
@@ -11991,8 +12011,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
@@ -12002,6 +12023,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1
@@ -12016,14 +12038,17 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12048,6 +12073,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -12688,6 +12714,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -12967,6 +12994,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -13802,16 +13830,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, -2
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2
@@ -13819,8 +13850,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
@@ -13830,6 +13862,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2
@@ -13844,14 +13877,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, -2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -13876,6 +13912,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -14516,6 +14553,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -14791,6 +14829,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -15623,16 +15662,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -15640,8 +15682,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
@@ -15651,6 +15694,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -15665,14 +15709,17 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -15697,6 +15744,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -16331,6 +16379,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -16607,6 +16656,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -17437,16 +17487,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
@@ -17454,8 +17507,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
@@ -17465,6 +17519,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
@@ -17479,14 +17534,17 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -17511,6 +17569,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 0c624a83ae1be..f80cb6cbcfd58 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -515,6 +515,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-NEXT: v_readlane_b32 s12, v1, 63
; GFX1164-NEXT: v_readlane_b32 s14, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s13, 32
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index b03ade4b527e6..5a5b838e4110f 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1387,7 +1387,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT: s_waitcnt_depctr 0xfffe
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s0, s2, s0
; GFX11-NEXT: s_addc_u32 s1, s3, s1
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
index b7ee9f70f6014..65832f8db2b00 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
@@ -519,6 +519,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -649,6 +650,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -771,6 +773,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -885,6 +888,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1443,6 +1447,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -1573,6 +1578,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -1695,6 +1701,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1809,6 +1816,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -2367,6 +2375,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -2497,6 +2506,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -2619,6 +2629,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -2733,6 +2744,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -3291,6 +3303,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -3421,6 +3434,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -3543,6 +3557,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -3657,6 +3672,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index f8f911b693e09..7e80279057e9d 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -969,14 +969,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -2008,14 +2008,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -3047,14 +3047,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -5229,12 +5229,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -6549,10 +6549,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -8775,12 +8776,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1164-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 1f76a476107a3..3af1b5e207e0b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -969,14 +969,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -2008,14 +2008,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -3047,14 +3047,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -5229,12 +5229,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -6549,10 +6549,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -8775,12 +8776,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1164-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
index 420539346b400..97c2bdceb31a8 100644
--- a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
@@ -13,6 +13,7 @@ define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) {
; SDAG-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; SDAG-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; SDAG-W64-NEXT: s_waitcnt_depctr 0xf1ff
; SDAG-W64-NEXT: ; return to shader part epilog
;
; GISEL-W64-LABEL: test_nor:
@@ -57,8 +58,8 @@ define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) {
; SDAG-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
+; SDAG-W64-NEXT: s_waitcnt_depctr 0xf1ff
; SDAG-W64-NEXT: s_and_b64 s[0:1], s[0:1], vcc
-; SDAG-W64-NEXT: s_waitcnt_depctr 0xfffe
; SDAG-W64-NEXT: ; return to shader part epilog
;
; GISEL-W64-LABEL: test_or_two_uses:
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 6fc92bce8242e..60faed95032ce 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1239,6 +1239,7 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x, i32 inreg %y) #0
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB11_6
; GFX11-NEXT: ; %bb.1: ; %bb
@@ -2066,6 +2067,7 @@ define amdgpu_ps void @scc_use_after_kill_inst(float inreg %x, i32 inreg %y) #0
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB17_6
; GFX11-NEXT: ; %bb.1: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
index 1eabe62e7710e..e1d3ebc2d35d1 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
@@ -38,6 +38,22 @@
define amdgpu_gs void @mask_hazard_no_hazard1() { ret void }
define amdgpu_gs void @mask_hazard_no_hazard2() { ret void }
define amdgpu_gs void @mask_hazard_no_hazard3() { ret void }
+ define amdgpu_gs void @mask_hazard_cancel_hazard1() { ret void }
+ define amdgpu_gs void @mask_hazard_cancel_hazard2() { ret void }
+ define amdgpu_gs void @mask_hazard_cancel_hazard3() { ret void }
+ define amdgpu_gs void @mask_hazard_cancel_hazard4() { ret void }
+ define amdgpu_gs void @mask_hazard_partial_cancel1() { ret void }
+ define amdgpu_gs void @mask_hazard_partial_cancel2() { ret void }
+ define amdgpu_gs void @mask_hazard_partial_cancel3() { ret void }
+ define amdgpu_gs void @mask_hazard_partial_cancel4() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_readlane1() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_readlane2() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_readlane3() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_readfirstlane() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_vcmp_vcc() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_vcmp_sgpr() { ret void }
+ define amdgpu_gs void @mask_hazard_combine1() { ret void }
+ define amdgpu_gs void @mask_hazard_combine2() { ret void }
...
---
@@ -487,8 +503,8 @@ body: |
; GFX11-LABEL: name: mask_hazard_subreg3
; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GFX11-NEXT: $sgpr2 = S_MOV_B32 0
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr3 = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: mask_hazard_subreg3
@@ -655,3 +671,373 @@ body: |
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
S_ENDPGM 0
...
+
+---
+name: mask_hazard_cancel_hazard1
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mask_hazard_cancel_hazard1
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GCN-NEXT: $vcc_lo = S_MOV_B32 0
+ ; GCN-NEXT: $vcc_hi = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GCN-NEXT: $vcc = S_MOV_B64 1
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc_lo = S_MOV_B32 0
+ $vcc_hi = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 $vcc_lo
+ $vcc = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_cancel_hazard2
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mask_hazard_cancel_hazard2
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GCN-NEXT: $vcc = S_MOV_B64 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GCN-NEXT: $vcc = S_MOV_B64 1
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc = S_MOV_B64 0
+ $sgpr0 = S_MOV_B32 $vcc_lo
+ $vcc = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_cancel_hazard3
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mask_hazard_cancel_hazard3
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 $sgpr0
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $sgpr0_sgpr1 = S_MOV_B64 0
+ $sgpr4 = S_MOV_B32 $sgpr0
+ $sgpr0_sgpr1 = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_cancel_hazard4
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mask_hazard_cancel_hazard4
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 $sgpr0
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr4 = S_MOV_B32 $sgpr0
+ $sgpr0_sgpr1 = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel1
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_partial_cancel1
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX11-NEXT: $vcc_lo = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GFX11-NEXT: $vcc = S_MOV_B64 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_partial_cancel1
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX12-NEXT: $vcc_lo = S_MOV_B32 0
+ ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX12-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GFX12-NEXT: $vcc = S_MOV_B64 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc_lo = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 $vcc_lo
+ $vcc = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel2
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_partial_cancel2
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX11-NEXT: $vcc_hi = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GFX11-NEXT: $vcc = S_MOV_B64 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_partial_cancel2
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX12-NEXT: $vcc_hi = S_MOV_B32 0
+ ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX12-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GFX12-NEXT: $vcc = S_MOV_B64 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc_hi = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 $vcc_lo
+ $vcc = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel3
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_partial_cancel3
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: $sgpr3 = S_MOV_B32 $sgpr0
+ ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_partial_cancel3
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX12-NEXT: $sgpr3 = S_MOV_B32 $sgpr0
+ ; GFX12-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr3 = S_MOV_B32 $sgpr0
+ $sgpr0_sgpr1 = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel4
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_partial_cancel4
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX11-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: $sgpr3 = S_MOV_B32 $sgpr1
+ ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_partial_cancel4
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX12-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX12-NEXT: $sgpr3 = S_MOV_B32 $sgpr1
+ ; GFX12-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $sgpr1 = S_MOV_B32 0
+ $sgpr3 = S_MOV_B32 $sgpr1
+ $sgpr0_sgpr1 = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readlane1
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_readlane1
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_readlane1
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readlane2
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_readlane2
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_readlane2
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readlane3
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_readlane3
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ ; GFX11-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_readlane3
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ ; GFX12-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readfirstlane
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_readfirstlane
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_readfirstlane
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_vcmp_vcc
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_vcmp_vcc
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65533
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_vcmp_vcc
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_vcmp_sgpr
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_vcmp_sgpr
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_vcmp_sgpr
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_combine1
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_combine1
+ ; GFX11: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX11-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX11-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX11-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61948
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_combine1
+ ; GFX12: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX12-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX12-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX12-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_combine2
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_combine2
+ ; GFX11: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX11-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX11-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65532
+ ; GFX11-NEXT: $sgpr1 = S_MOV_B32 $sgpr4
+ ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_combine2
+ ; GFX12: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX12-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX12-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX12-NEXT: $sgpr1 = S_MOV_B32 $sgpr4
+ ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 $sgpr4
+ $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ S_ENDPGM 0
+...
>From 46f998ebaef04bdbd394d98f319a02fbe656795b Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Wed, 7 May 2025 20:26:47 +0900
Subject: [PATCH 2/4] - Use RegClass.contains()
---
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 99a85a1f969fc..f11d1f28832d8 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -3044,9 +3044,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
return false;
const Register HazardReg = HazardDef->getReg();
- auto *HazardRegRC = TRI->getPhysRegBaseClass(HazardReg);
- bool IsSGPR32 = (HazardRegRC == TRI->getSGPRClassForBitWidth(32)) ||
- HazardReg == AMDGPU::VCC_LO || HazardReg == AMDGPU::VCC_HI;
+ bool IsSGPR32 = (AMDGPU::SReg_32RegClass.contains(HazardReg) ||
+ HazardReg == AMDGPU::VCC_LO || HazardReg == AMDGPU::VCC_HI);
// Setup to track writes to individual SGPRs
if (IsSGPR32) {
@@ -3055,7 +3054,7 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
InitialState.HazardSGPRs.insert(AMDGPU::VCC_LO);
InitialState.HazardSGPRs.insert(AMDGPU::VCC_HI);
} else {
- assert(HazardRegRC && HazardRegRC == TRI->getSGPRClassForBitWidth(64));
+ assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
}
>From 781217ee7f7a70ad1f5e76d3eec9b2f94eacb0eb Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 8 May 2025 22:04:44 +0900
Subject: [PATCH 3/4] - Simplify SGPR32/SGPR64 determination.
---
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index f11d1f28832d8..70a59505772e8 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -3043,12 +3043,9 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
if (!HazardDef)
return false;
- const Register HazardReg = HazardDef->getReg();
- bool IsSGPR32 = (AMDGPU::SReg_32RegClass.contains(HazardReg) ||
- HazardReg == AMDGPU::VCC_LO || HazardReg == AMDGPU::VCC_HI);
-
// Setup to track writes to individual SGPRs
- if (IsSGPR32) {
+ const Register HazardReg = HazardDef->getReg();
+ if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
InitialState.HazardSGPRs.insert(HazardReg);
} else if (IsVCC(HazardReg)) {
InitialState.HazardSGPRs.insert(AMDGPU::VCC_LO);
>From 1b747187f42b946b7d90cad702744a13da13308e Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 8 May 2025 22:16:40 +0900
Subject: [PATCH 4/4] - Improve comment accuracy
---
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 70a59505772e8..da529e0b3c681 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2980,9 +2980,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
// 1. VALU reads SGPR as mask
// 2. VALU/SALU writes SGPR
// 3. VALU/SALU reads SGPR
- // The hazard can expire if the distance between 2 and 3 is sufficient.
- // In practice this happens <10% of the time, hence this always assumes
- // the hazard exists if 1 and 2 are present to avoid searching.
+ // The hazard can expire if the distance between 2 and 3 is sufficient,
+ // or (2) is VALU and (3) is SALU.
+ // In practice this happens <10% of the time, hence always assume the hazard
+ // exists if (1) and (2) are present to avoid searching all SGPR reads.
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
More information about the llvm-commits mailing list