[llvm] 385c121 - [AMDGPU] Rework GFX11 VALU Mask Write Hazard (#138663)
via llvm-commits (llvm-commits at lists.llvm.org)
Tue Oct 28 00:09:32 PDT 2025
Author: Carl Ritson
Date: 2025-10-28T16:09:28+09:00
New Revision: 385c12134aa6b3215d92ce6034d99fd7aec45dd7
URL: https://github.com/llvm/llvm-project/commit/385c12134aa6b3215d92ce6034d99fd7aec45dd7
DIFF: https://github.com/llvm/llvm-project/commit/385c12134aa6b3215d92ce6034d99fd7aec45dd7.diff
LOG: [AMDGPU] Rework GFX11 VALU Mask Write Hazard (#138663)
Apply additional counter waits to address VALU writes to SGPRs. Rework
expiry detection and apply wait coalescing to mitigate some of the
additional waits.
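As context, the hazard pattern this pass mitigates, sketched as a
hypothetical GFX11 wave64 sequence (register choices and surrounding
instructions are illustrative, not taken from the commit):

  v_cndmask_b32_e64 v0, v1, v2, s[0:1]  ; (1) VALU reads s[0:1] as a lane mask
  s_mov_b64 s[0:1], exec                ; (2) SALU writes s[0:1]
  s_waitcnt_depctr 0xfffe               ; inserted wait: sa_sdst(0)
  s_cmp_lg_u64 s[0:1], 0                ; (3) read of s[0:1] is now safe

With this change the writer in step (2) may also be a VALU instruction,
which needs a va_vcc(0) or va_sdst(0) wait instead of sa_sdst(0); these
are the 0xfffd and 0xf1ff immediates that appear in the test updates below.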
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index a911e7e6c01a4..52cc4ca5a955c 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -3267,29 +3267,103 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
return false;
assert(!ST.hasExtendedWaitCounts());
- if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
+ if (!ST.isWave64())
+ return false;
+
+ const bool IsSALU = SIInstrInfo::isSALU(*MI);
+ const bool IsVALU = SIInstrInfo::isVALU(*MI);
+ if (!IsSALU && !IsVALU)
return false;
// The hazard sequence is three instructions:
// 1. VALU reads SGPR as mask
- // 2. SALU writes SGPR
- // 3. SALU reads SGPR
- // The hazard can expire if the distance between 2 and 3 is sufficient.
- // In practice this happens <10% of the time, hence this always assumes
- // the hazard exists if 1 and 2 are present to avoid searching.
+ // 2. VALU/SALU writes SGPR
+ // 3. VALU/SALU reads SGPR
+ // The hazard can expire if the distance between 2 and 3 is sufficient,
+ // or (2) is VALU and (3) is SALU.
+ // In practice this happens <10% of the time, hence always assume the hazard
+ // exists if (1) and (2) are present to avoid searching all SGPR reads.
- const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
- if (!SDSTOp || !SDSTOp->isReg())
- return false;
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ auto IgnoreableSGPR = [](const Register Reg) {
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::M0:
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::SGPR_NULL64:
+ case AMDGPU::SCC:
+ return true;
+ default:
+ return false;
+ }
+ };
+ auto IsVCC = [](const Register Reg) {
+ return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
+ };
+
+ struct StateType {
+ SmallSet<Register, 2> HazardSGPRs;
+
+ static unsigned getHashValue(const StateType &State) {
+ return hash_combine_range(State.HazardSGPRs);
+ }
+ static bool isEqual(const StateType &LHS, const StateType &RHS) {
+ return LHS.HazardSGPRs == RHS.HazardSGPRs;
+ }
+ };
+
+ SmallVector<const MachineInstr *> WaitInstrs;
+ bool HasSGPRRead = false;
+ StateType InitialState;
+
+ // Look for SGPR write.
+ MachineOperand *HazardDef = nullptr;
+ for (MachineOperand &Op : MI->operands()) {
+ if (!Op.isReg())
+ continue;
+ if (Op.isDef() && HazardDef)
+ continue;
+
+ Register Reg = Op.getReg();
+ if (IgnoreableSGPR(Reg))
+ continue;
+ if (!IsVCC(Reg)) {
+ if (Op.isImplicit())
+ continue;
+ if (!TRI->isSGPRReg(MRI, Reg))
+ continue;
+ }
+ // Also check for SGPR reads.
+ if (Op.isUse()) {
+ HasSGPRRead = true;
+ continue;
+ }
+
+ assert(!HazardDef);
+ HazardDef = &Op;
+ }
- const Register HazardReg = SDSTOp->getReg();
- if (HazardReg == AMDGPU::EXEC ||
- HazardReg == AMDGPU::EXEC_LO ||
- HazardReg == AMDGPU::EXEC_HI ||
- HazardReg == AMDGPU::M0)
+ if (!HazardDef)
return false;
- auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
+ // Setup to track writes to individual SGPRs
+ const Register HazardReg = HazardDef->getReg();
+ if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
+ InitialState.HazardSGPRs.insert(HazardReg);
+ } else {
+ assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
+ InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
+ InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
+ }
+
+ auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
+ if (State.HazardSGPRs.empty())
+ return HazardExpired;
+
switch (I.getOpcode()) {
case AMDGPU::V_ADDC_U32_e32:
case AMDGPU::V_ADDC_U32_dpp:
@@ -3304,11 +3378,10 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
case AMDGPU::V_SUBB_U32_e32:
case AMDGPU::V_SUBB_U32_dpp:
case AMDGPU::V_SUBBREV_U32_e32:
- case AMDGPU::V_SUBBREV_U32_dpp:
+ case AMDGPU::V_SUBBREV_U32_dpp: {
// These implicitly read VCC as mask source.
- return HazardReg == AMDGPU::VCC ||
- HazardReg == AMDGPU::VCC_LO ||
- HazardReg == AMDGPU::VCC_HI;
+ return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
+ }
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_ADDC_U32_e64_dpp:
case AMDGPU::V_CNDMASK_B16_t16_e64:
@@ -3324,68 +3397,109 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
// Only check mask register overlaps.
const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
assert(SSRCOp);
- return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
+ bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
+ return Result ? HazardFound : NoHazardFound;
}
default:
- return false;
+ return NoHazardFound;
}
};
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
- // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
- if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
- return true;
-
- // VALU access to any SGPR or literal constant other than HazardReg
- // mitigates hazard. No need to check HazardReg here as this will
- // only be called when !IsHazardFn.
- if (!SIInstrInfo::isVALU(I))
- return false;
- for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
- const MachineOperand &Op = I.getOperand(OpNo);
- if (Op.isReg()) {
- Register OpReg = Op.getReg();
- // Only consider uses
- if (!Op.isUse())
+ const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
+ AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0),
+ 0);
+ auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
+ switch (I.getOpcode()) {
+ case AMDGPU::S_WAITCNT_DEPCTR:
+ // Record mergable waits within region of instructions free of SGPR reads.
+ if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
+ (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
+ WaitInstrs.push_back(&I);
+ break;
+ default:
+ // Update tracking of SGPR reads and writes.
+ for (auto &Op : I.operands()) {
+ if (!Op.isReg())
continue;
- // Ignore EXEC
- if (OpReg == AMDGPU::EXEC ||
- OpReg == AMDGPU::EXEC_LO ||
- OpReg == AMDGPU::EXEC_HI)
+
+ Register Reg = Op.getReg();
+ if (IgnoreableSGPR(Reg))
continue;
- // Ignore all implicit uses except VCC
- if (Op.isImplicit()) {
- if (OpReg == AMDGPU::VCC ||
- OpReg == AMDGPU::VCC_LO ||
- OpReg == AMDGPU::VCC_HI)
- return true;
+ if (!IsVCC(Reg)) {
+ if (Op.isImplicit())
+ continue;
+ if (!TRI->isSGPRReg(MRI, Reg))
+ continue;
+ }
+ if (Op.isUse()) {
+ HasSGPRRead = true;
continue;
}
- if (TRI.isSGPRReg(MRI, OpReg))
- return true;
- } else {
- const MCInstrDesc &InstDesc = I.getDesc();
- const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
- if (!TII.isInlineConstant(Op, OpInfo))
- return true;
+
+ // Stop tracking any SGPRs with writes on the basis that they will
+ // already have an appropriate wait inserted afterwards.
+ SmallVector<Register, 2> Found;
+ for (Register SGPR : State.HazardSGPRs) {
+ if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
+ Found.push_back(SGPR);
+ }
+ for (Register SGPR : Found)
+ State.HazardSGPRs.erase(SGPR);
}
+ break;
}
- return false;
};
// Check for hazard
- if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
- std::numeric_limits<int>::max())
+ if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
+ MI->getParent(),
+ std::next(MI->getReverseIterator())))
return false;
- auto NextMI = std::next(MI->getIterator());
+ // Compute counter mask
+ unsigned DepCtr =
+ IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
+ : AMDGPU::DepCtr::encodeFieldVaSdst(0))
+ : AMDGPU::DepCtr::encodeFieldSaSdst(0);
+
+ // Try to merge previous waits into this one for regions with no SGPR reads.
+ if (!WaitInstrs.empty()) {
+ // Note: WaitInstrs contains const pointers, so walk backward from MI to
+ // obtain a mutable pointer to each instruction to be merged.
+ // This is expected to be a very short walk within the same block.
+ SmallVector<MachineInstr *> ToErase;
+ unsigned Found = 0;
+ for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
+ End = MI->getParent()->rend();
+ Found < WaitInstrs.size() && It != End; ++It) {
+ MachineInstr *WaitMI = &*It;
+ // Find next wait instruction.
+ if (std::as_const(WaitMI) != WaitInstrs[Found])
+ continue;
+ Found++;
+ unsigned WaitMask = WaitMI->getOperand(0).getImm();
+ assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
+ DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
+ AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
+ DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
+ AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
+ DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
+ DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
+ AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
+ ToErase.push_back(WaitMI);
+ }
+ assert(Found == WaitInstrs.size());
+ for (MachineInstr *WaitMI : ToErase)
+ WaitMI->eraseFromParent();
+ }
- // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
+ // Add s_waitcnt_depctr after SGPR write.
+ auto NextMI = std::next(MI->getIterator());
auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(DepCtr);
// SALU write may be s_getpc in a bundle.
updateGetPCBundle(NewMI);
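To make the test churn below easier to read, the s_waitcnt_depctr
immediates decode as follows (assuming the GFX11 DepCtr layout, where an
all-ones immediate waits on nothing): 0xfffe clears sa_sdst, 0xfffd clears
va_vcc, 0xf1ff clears va_sdst, and 0xfffc clears both sa_sdst and va_vcc.
The coalescing path above takes the per-field minimum across mergeable
waits, so a pre-existing wait in a region free of SGPR reads is erased and
folded into the newly inserted one. A hypothetical example:

  ; without coalescing, two waits would be emitted:
  s_waitcnt_depctr 0xfffe           ; earlier wait: sa_sdst(0)
  v_add_co_u32 v2, vcc, v2, v3      ; VALU write to VCC (hazard writer)
  s_waitcnt_depctr 0xfffd           ; new wait: va_vcc(0)

  ; with coalescing, the earlier wait is folded into the new one:
  v_add_co_u32 v2, vcc, v2, v3
  s_waitcnt_depctr 0xfffc           ; sa_sdst(0) and va_vcc(0) combined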
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 9db6d706b634b..6a95881067b93 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -3154,17 +3154,19 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -3172,6 +3174,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
@@ -3184,8 +3187,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -3237,8 +3241,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, s6
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v7, null, s3, v9, vcc
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
@@ -5856,6 +5861,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[10:11]
@@ -6384,6 +6390,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: s_or_b64 s[12:13], vcc, s[12:13]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[12:13]
@@ -6983,6 +6990,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: buffer_gl1_inv
; GFX1164_ITERATIVE-NEXT: buffer_gl0_inv
; GFX1164_ITERATIVE-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX1164_ITERATIVE-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_ITERATIVE-NEXT: s_or_b64 s[12:13], vcc, s[12:13]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 exec, exec, s[12:13]
@@ -7669,17 +7677,19 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7687,6 +7697,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
@@ -7699,8 +7710,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -7748,8 +7760,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v7
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v6
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, v10, s8
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_subrev_co_ci_u32_e64 v9, null, s9, v11, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v8
@@ -7761,6 +7774,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: buffer_gl1_inv
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[10:11]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_or_b64 s[12:13], vcc, s[12:13]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: s_and_not1_b64 exec, exec, s[12:13]
@@ -7775,8 +7789,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v5
; GFX1164_DPP-NEXT: v_readfirstlane_b32 s3, v7
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e64 v7, null, s3, v9, vcc
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
@@ -12770,6 +12785,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX1164-TRUE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12826,6 +12842,7 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -13804,9 +13821,10 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX1164-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX1164-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX1164-TRUE16-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
@@ -13815,6 +13833,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2
; GFX1164-TRUE16-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -13863,6 +13882,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX1164-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX1164-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xf1ff
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
@@ -13873,6 +13893,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2
; GFX1164-FAKE16-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
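Note that much of the remaining churn in these tests is knock-on
renumbering rather than new waits: an inserted s_waitcnt_depctr adds one
instruction between delayed pairs, so s_delay_alu instskip fields shift
(e.g. SKIP_1 becomes SKIP_2), and the VALU_DEP operands are recomputed
wherever the s_delay_alu placement itself moves.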
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 6167a84094b7a..08a4f0cdad18f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -957,6 +957,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -2640,17 +2641,19 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -2658,6 +2661,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
@@ -2670,13 +2674,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2701,6 +2707,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -3288,23 +3295,26 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v2, 0, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0
; GFX1164_DPP-NEXT: v_permlane64_b32 v3, v2
@@ -3312,10 +3322,11 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_permlane64_b32 v4, v1
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffc
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v4, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
@@ -4337,6 +4348,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -6043,17 +6055,19 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v2, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -6061,6 +6075,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v4, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v5, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v1, -1, -1
@@ -6073,13 +6088,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s2
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6104,6 +6121,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -6724,6 +6742,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -7441,6 +7460,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
@@ -8071,6 +8091,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -8787,6 +8808,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
@@ -9417,6 +9439,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -10133,6 +10156,7 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s3, v2, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
@@ -10763,6 +10787,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -11041,6 +11066,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -11870,16 +11896,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, 1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1
@@ -11887,8 +11916,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
@@ -11898,6 +11928,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, 1
@@ -11912,14 +11943,17 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, 1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -11944,6 +11978,7 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -12577,6 +12612,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -12855,6 +12891,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -13684,16 +13721,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v6, -2
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2
@@ -13701,8 +13741,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
@@ -13712,6 +13753,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v4, -2
@@ -13726,14 +13768,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v5, -2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -13758,6 +13803,7 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -14391,6 +14437,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -14665,6 +14712,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -15489,16 +15537,19 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -15506,8 +15557,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
@@ -15517,6 +15569,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
@@ -15531,14 +15584,17 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -15563,6 +15619,7 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
@@ -16190,6 +16247,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v1, 47
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s3, 32
@@ -16465,6 +16523,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -17287,16 +17346,19 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
@@ -17304,8 +17366,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v2, -1, -1
@@ -17315,6 +17378,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
@@ -17329,14 +17393,17 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v5 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[3:4]
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, -1
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s2, v2, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -17361,6 +17428,7 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v4, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffd
; GFX1164_DPP-NEXT: s_mov_b32 s2, -1
; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[6:7], vcc
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 0f593045007fa..62f09deeb9b08 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -515,6 +515,7 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1
+; GFX1164-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-NEXT: v_readlane_b32 s12, v1, 63
; GFX1164-NEXT: v_readlane_b32 s14, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s13, 32
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 5959f76492f3c..f6149cf9caf7c 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1387,7 +1387,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
-; GFX11-NEXT: s_waitcnt_depctr 0xfffe
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s0, s2, s0
; GFX11-NEXT: s_addc_u32 s1, s3, s1
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
index b7ee9f70f6014..65832f8db2b00 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll
@@ -519,6 +519,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -649,6 +650,7 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -771,6 +773,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -885,6 +888,7 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1443,6 +1447,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -1573,6 +1578,7 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) i
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -1695,6 +1701,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -1809,6 +1816,7 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -2367,6 +2375,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -2497,6 +2506,7 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -2619,6 +2629,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -2733,6 +2744,7 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -3291,6 +3303,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -3421,6 +3434,7 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
@@ -3543,6 +3557,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
@@ -3657,6 +3672,7 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v3
; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 4581efc06504a..ff4b658606390 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -963,14 +963,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -1996,14 +1996,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -3029,14 +3029,14 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -4248,10 +4248,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -5511,10 +5512,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -6774,10 +6776,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index bd570d9eccdc3..15d4b9a84f3f8 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -963,14 +963,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -1996,14 +1996,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -3029,14 +3029,14 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -4248,10 +4248,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -5511,10 +5512,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -6774,10 +6776,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
index 0a2e7afa3d417..8103140f2a587 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -1185,6 +1185,7 @@ define amdgpu_ps void @fcmp_x2(float %a) #0 {
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0x3e800000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB21_1
; GFX11-NEXT: s_endpgm
@@ -1595,6 +1596,7 @@ define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1,
; GFX11-NEXT: s_mov_b64 s[2:3], exec
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX11-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xf1ff
; GFX11-NEXT: .LBB26_2: ; %bb
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_add_f32_e32 v0, 0x3e800000, v0
diff --git a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
index 420539346b400..97c2bdceb31a8 100644
--- a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll
@@ -13,6 +13,7 @@ define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) {
; SDAG-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; SDAG-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; SDAG-W64-NEXT: s_waitcnt_depctr 0xf1ff
; SDAG-W64-NEXT: ; return to shader part epilog
;
; GISEL-W64-LABEL: test_nor:
@@ -57,8 +58,8 @@ define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) {
; SDAG-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
+; SDAG-W64-NEXT: s_waitcnt_depctr 0xf1ff
; SDAG-W64-NEXT: s_and_b64 s[0:1], s[0:1], vcc
-; SDAG-W64-NEXT: s_waitcnt_depctr 0xfffe
; SDAG-W64-NEXT: ; return to shader part epilog
;
; GISEL-W64-LABEL: test_or_two_uses:
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index b21c781f6223a..5461532184fc5 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1239,6 +1239,7 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x, i32 inreg %y) #0
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB11_6
; GFX11-NEXT: ; %bb.1: ; %bb
@@ -2066,6 +2067,7 @@ define amdgpu_ps void @scc_use_after_kill_inst(float inreg %x, i32 inreg %y) #0
; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfffd
; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB17_6
; GFX11-NEXT: ; %bb.1: ; %bb
diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
index 1eabe62e7710e..e1d3ebc2d35d1 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
@@ -38,6 +38,22 @@
define amdgpu_gs void @mask_hazard_no_hazard1() { ret void }
define amdgpu_gs void @mask_hazard_no_hazard2() { ret void }
define amdgpu_gs void @mask_hazard_no_hazard3() { ret void }
+ define amdgpu_gs void @mask_hazard_cancel_hazard1() { ret void }
+ define amdgpu_gs void @mask_hazard_cancel_hazard2() { ret void }
+ define amdgpu_gs void @mask_hazard_cancel_hazard3() { ret void }
+ define amdgpu_gs void @mask_hazard_cancel_hazard4() { ret void }
+ define amdgpu_gs void @mask_hazard_partial_cancel1() { ret void }
+ define amdgpu_gs void @mask_hazard_partial_cancel2() { ret void }
+ define amdgpu_gs void @mask_hazard_partial_cancel3() { ret void }
+ define amdgpu_gs void @mask_hazard_partial_cancel4() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_readlane1() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_readlane2() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_readlane3() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_readfirstlane() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_vcmp_vcc() { ret void }
+ define amdgpu_gs void @mask_hazard_valu_vcmp_sgpr() { ret void }
+ define amdgpu_gs void @mask_hazard_combine1() { ret void }
+ define amdgpu_gs void @mask_hazard_combine2() { ret void }
...
---
@@ -487,8 +503,8 @@ body: |
; GFX11-LABEL: name: mask_hazard_subreg3
; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
; GFX11-NEXT: $sgpr2 = S_MOV_B32 0
- ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: $sgpr3 = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
; GFX11-NEXT: S_ENDPGM 0
;
; GFX12-LABEL: name: mask_hazard_subreg3
@@ -655,3 +671,373 @@ body: |
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
S_ENDPGM 0
...
+
+---
+name: mask_hazard_cancel_hazard1
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mask_hazard_cancel_hazard1
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GCN-NEXT: $vcc_lo = S_MOV_B32 0
+ ; GCN-NEXT: $vcc_hi = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GCN-NEXT: $vcc = S_MOV_B64 1
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc_lo = S_MOV_B32 0
+ $vcc_hi = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 $vcc_lo
+ $vcc = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_cancel_hazard2
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mask_hazard_cancel_hazard2
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GCN-NEXT: $vcc = S_MOV_B64 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GCN-NEXT: $vcc = S_MOV_B64 1
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc = S_MOV_B64 0
+ $sgpr0 = S_MOV_B32 $vcc_lo
+ $vcc = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_cancel_hazard3
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mask_hazard_cancel_hazard3
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 $sgpr0
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $sgpr0_sgpr1 = S_MOV_B64 0
+ $sgpr4 = S_MOV_B32 $sgpr0
+ $sgpr0_sgpr1 = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_cancel_hazard4
+body: |
+ bb.0:
+ ; GCN-LABEL: name: mask_hazard_cancel_hazard4
+ ; GCN: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GCN-NEXT: $sgpr4 = S_MOV_B32 $sgpr0
+ ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GCN-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr4 = S_MOV_B32 $sgpr0
+ $sgpr0_sgpr1 = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel1
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_partial_cancel1
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX11-NEXT: $vcc_lo = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GFX11-NEXT: $vcc = S_MOV_B64 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_partial_cancel1
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX12-NEXT: $vcc_lo = S_MOV_B32 0
+ ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX12-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GFX12-NEXT: $vcc = S_MOV_B64 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc_lo = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 $vcc_lo
+ $vcc = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel2
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_partial_cancel2
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX11-NEXT: $vcc_hi = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GFX11-NEXT: $vcc = S_MOV_B64 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_partial_cancel2
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX12-NEXT: $vcc_hi = S_MOV_B32 0
+ ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX12-NEXT: $sgpr0 = S_MOV_B32 $vcc_lo
+ ; GFX12-NEXT: $vcc = S_MOV_B64 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vcc_hi = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 $vcc_lo
+ $vcc = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel3
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_partial_cancel3
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: $sgpr3 = S_MOV_B32 $sgpr0
+ ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_partial_cancel3
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX12-NEXT: $sgpr3 = S_MOV_B32 $sgpr0
+ ; GFX12-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr3 = S_MOV_B32 $sgpr0
+ $sgpr0_sgpr1 = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_partial_cancel4
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_partial_cancel4
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX11-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: $sgpr3 = S_MOV_B32 $sgpr1
+ ; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_partial_cancel4
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX12-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GFX12-NEXT: S_WAITCNT_DEPCTR 65534
+ ; GFX12-NEXT: $sgpr3 = S_MOV_B32 $sgpr1
+ ; GFX12-NEXT: $sgpr0_sgpr1 = S_MOV_B64 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $sgpr1 = S_MOV_B32 0
+ $sgpr3 = S_MOV_B32 $sgpr1
+ $sgpr0_sgpr1 = S_MOV_B64 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readlane1
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_readlane1
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_readlane1
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readlane2
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_readlane2
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_readlane2
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readlane3
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_readlane3
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ ; GFX11-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_readlane3
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ ; GFX12-NEXT: $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ $sgpr2 = V_READLANE_B32 $vgpr3, 0
+ $sgpr3 = V_READLANE_B32 $vgpr3, 1
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_readfirstlane
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_readfirstlane
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_readfirstlane
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ $sgpr2 = V_READFIRSTLANE_B32 $vgpr3, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_vcmp_vcc
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_vcmp_vcc
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65533
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_vcmp_vcc
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_valu_vcmp_sgpr
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_valu_vcmp_sgpr
+ ; GFX11: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61951
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_valu_vcmp_sgpr
+ ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_combine1
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_combine1
+ ; GFX11: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX11-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX11-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX11-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61948
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_combine1
+ ; GFX12: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX12-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX12-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX12-NEXT: $sgpr1 = S_MOV_B32 0
+ ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 0
+ $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mask_hazard_combine2
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: mask_hazard_combine2
+ ; GFX11: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX11-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX11-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX11-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX11-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 65532
+ ; GFX11-NEXT: $sgpr1 = S_MOV_B32 $sgpr4
+ ; GFX11-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX11-NEXT: S_WAITCNT_DEPCTR 61950
+ ; GFX11-NEXT: S_ENDPGM 0
+ ;
+ ; GFX12-LABEL: name: mask_hazard_combine2
+ ; GFX12: $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ ; GFX12-NEXT: $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ ; GFX12-NEXT: $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ ; GFX12-NEXT: V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0
+ ; GFX12-NEXT: $sgpr1 = S_MOV_B32 $sgpr4
+ ; GFX12-NEXT: $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0
+ $vgpr3 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
+ $vgpr4 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec
+ $vgpr5 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec
+ V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr1 = S_MOV_B32 $sgpr4
+ $sgpr2_sgpr3 = V_CMP_EQ_U32_e64 3, $vgpr5, implicit $exec
+ S_ENDPGM 0
+...
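For reference, the s_waitcnt_depctr immediates in the hunks above encode which dependency counters must drain before the next instruction issues. Below is a minimal standalone sketch (not part of this patch) that decodes the immediates seen in the new CHECK lines, assuming the GFX11 DEPCTR bit layout used by the AMDGPU backend: sa_sdst at bit 0, va_vcc at bit 1, and va_sdst at bits 11:9; all other fields are left at their "no wait" values here.

// Standalone sketch: decode the depctr immediates that appear in the
// updated tests. Field positions are an assumption based on the GFX11
// DEPCTR layout (sa_sdst bit 0, va_vcc bit 1, va_sdst bits 11:9).
#include <cstdint>
#include <cstdio>

struct DepCtrFields {
  unsigned SaSdst; // 0 = wait for outstanding SALU SGPR writes
  unsigned VaVcc;  // 0 = wait for outstanding VALU writes to VCC
  unsigned VaSdst; // 0 = wait for outstanding VALU SGPR writes
};

static DepCtrFields decodeDepCtr(uint16_t Imm) {
  return {static_cast<unsigned>(Imm & 1),
          static_cast<unsigned>((Imm >> 1) & 1),
          static_cast<unsigned>((Imm >> 9) & 7)};
}

int main() {
  // Immediates taken from the CHECK lines above (hex and decimal forms
  // both occur in the tests: 65534 == 0xfffe, 61951 == 0xf1ff, etc.).
  const uint16_t Imms[] = {0xfffe, 0xfffd, 0xf1ff, 0xfffc, 61950, 61948};
  for (uint16_t Imm : Imms) {
    DepCtrFields F = decodeDepCtr(Imm);
    std::printf("0x%04x: sa_sdst=%u va_vcc=%u va_sdst=%u\n", Imm, F.SaSdst,
                F.VaVcc, F.VaSdst);
  }
  return 0;
}

Under that assumed layout, 0xfffe waits only for SALU SGPR writes, 0xfffd for VALU writes to VCC, 0xf1ff (61951) for VALU writes to SGPRs, and combined masks such as 61948 (0xf1fc) cover several of these at once, which lines up with the wait coalescing visible in mask_hazard_combine1 and mask_hazard_combine2 above.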