[llvm] cfb7ffd - [AMDGPU] New AMDGPUInsertDelayAlu pass
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 29 13:33:02 PDT 2022
Author: Jay Foad
Date: 2022-06-29T21:30:20+01:00
New Revision: cfb7ffdec0ebafc46b0e37f636c11d8ac74cd39c
URL: https://github.com/llvm/llvm-project/commit/cfb7ffdec0ebafc46b0e37f636c11d8ac74cd39c
DIFF: https://github.com/llvm/llvm-project/commit/cfb7ffdec0ebafc46b0e37f636c11d8ac74cd39c.diff
LOG: [AMDGPU] New AMDGPUInsertDelayAlu pass
Differential Revision: https://reviews.llvm.org/D128270
Added:
llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/CMakeLists.txt
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
llvm/test/CodeGen/AMDGPU/clamp.ll
llvm/test/CodeGen/AMDGPU/cluster_stores.ll
llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
llvm/test/CodeGen/AMDGPU/flat-scratch.ll
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
llvm/test/CodeGen/AMDGPU/mad_64_32.ll
llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 33f59ad60b3eb..41ab0eba8b125 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -299,6 +299,9 @@ extern char &SIMemoryLegalizerID;
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;
+void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
+extern char &AMDGPUInsertDelayAluID;
+
void initializeSIInsertHardClausesPass(PassRegistry &);
extern char &SIInsertHardClausesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
new file mode 100644
index 0000000000000..c9cdbc89f3a41
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -0,0 +1,457 @@
+//===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_delay_alu instructions to avoid stalls on GFX11+.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SetVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-insert-delay-alu"
+
+namespace {
+
+class AMDGPUInsertDelayAlu : public MachineFunctionPass {
+public:
+ static char ID;
+
+ // Cached subtarget info; initialized in runOnMachineFunction.
+ const SIInstrInfo *SII;
+ const TargetRegisterInfo *TRI;
+
+ TargetSchedModel SchedModel;
+
+ AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ // Return true if MI waits for all outstanding VALU instructions to complete.
+ static bool instructionWaitsForVALU(const MachineInstr &MI) {
+ // These instruction types wait for VA_VDST==0 before issuing.
+ const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
+ SIInstrFlags::FLAT | SIInstrFlags::MIMG |
+ SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
+ if (MI.getDesc().TSFlags & VA_VDST_0)
+ return true;
+ if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
+ MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
+ return true;
+ // 0xf000 is presumably the VA_VDST field of the depctr immediate, so a
+ // zero there means the instruction waits for VA_VDST==0 — TODO confirm
+ // against the depctr field encodings.
+ if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+ (MI.getOperand(0).getImm() & 0xf000) == 0)
+ return true;
+ return false;
+ }
+
+ // Types of delay that can be encoded in an s_delay_alu instruction.
+ enum DelayType { VALU, TRANS, SALU, OTHER };
+
+ // Get the delay type for an instruction with the specified TSFlags.
+ // TRANS takes priority over VALU because TRANS instructions also carry the
+ // VALU flag.
+ static DelayType getDelayType(uint64_t TSFlags) {
+ if (TSFlags & SIInstrFlags::TRANS)
+ return TRANS;
+ if (TSFlags & SIInstrFlags::VALU)
+ return VALU;
+ if (TSFlags & SIInstrFlags::SALU)
+ return SALU;
+ return OTHER;
+ }
+
+ // Information about the last instruction(s) that wrote to a particular
+ // regunit. In straight-line code there will only be one such instruction, but
+ // when control flow converges we merge the delay information from each path
+ // to represent the union of the worst-case delays of each type.
+ struct DelayInfo {
+ // One larger than the maximum number of (non-TRANS) VALU instructions we
+ // can encode in an s_delay_alu instruction.
+ static const unsigned VALU_MAX = 5;
+
+ // One larger than the maximum number of TRANS instructions we can encode in
+ // an s_delay_alu instruction.
+ static const unsigned TRANS_MAX = 4;
+
+ // If it was written by a (non-TRANS) VALU, remember how many clock cycles
+ // are left until it completes, and how many other (non-TRANS) VALU we have
+ // seen since it was issued.
+ uint8_t VALUCycles = 0;
+ uint8_t VALUNum = VALU_MAX;
+
+ // If it was written by a TRANS, remember how many clock cycles are left
+ // until it completes, and how many other TRANS we have seen since it was
+ // issued.
+ uint8_t TRANSCycles = 0;
+ uint8_t TRANSNum = TRANS_MAX;
+ // Also remember how many other (non-TRANS) VALU we have seen since it was
+ // issued. When an instruction depends on both a prior TRANS and a prior
+ // non-TRANS VALU, this is used to decide whether to encode a wait for just
+ // one or both of them.
+ uint8_t TRANSNumVALU = VALU_MAX;
+
+ // If it was written by an SALU, remember how many clock cycles are left
+ // until it completes.
+ uint8_t SALUCycles = 0;
+
+ DelayInfo() = default;
+
+ DelayInfo(DelayType Type, unsigned Cycles) {
+ switch (Type) {
+ default:
+ llvm_unreachable("unexpected type");
+ case VALU:
+ VALUCycles = Cycles;
+ VALUNum = 0;
+ break;
+ case TRANS:
+ TRANSCycles = Cycles;
+ TRANSNum = 0;
+ TRANSNumVALU = 0;
+ break;
+ case SALU:
+ SALUCycles = Cycles;
+ break;
+ }
+ }
+
+ bool operator==(const DelayInfo &RHS) const {
+ return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
+ TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
+ TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
+ }
+
+ bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
+
+ // Merge another DelayInfo into this one, to represent the union of the
+ // worst-case delays of each type.
+ void merge(const DelayInfo &RHS) {
+ VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
+ VALUNum = std::min(VALUNum, RHS.VALUNum);
+ TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
+ TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
+ TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
+ SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
+ }
+
+ // Update this DelayInfo after issuing an instruction of the given delay
+ // Type. Cycles is the number of cycles it takes to issue the instruction.
+ // Return true if there is no longer any useful delay info, so the caller
+ // can erase this entry.
+ bool advance(DelayType Type, unsigned Cycles) {
+ bool Erase = true;
+
+ VALUNum += (Type == VALU);
+ if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
+ // Forget about the VALU instruction. It was too far back or has
+ // definitely completed by now.
+ VALUNum = VALU_MAX;
+ VALUCycles = 0;
+ } else {
+ VALUCycles -= Cycles;
+ Erase = false;
+ }
+
+ TRANSNum += (Type == TRANS);
+ TRANSNumVALU += (Type == VALU);
+ if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
+ // Forget about any TRANS instruction. It was too far back or has
+ // definitely completed by now.
+ TRANSNum = TRANS_MAX;
+ TRANSNumVALU = VALU_MAX;
+ TRANSCycles = 0;
+ } else {
+ TRANSCycles -= Cycles;
+ Erase = false;
+ }
+
+ if (SALUCycles <= Cycles) {
+ // Forget about any SALU instruction. It has definitely completed by
+ // now.
+ SALUCycles = 0;
+ } else {
+ SALUCycles -= Cycles;
+ Erase = false;
+ }
+
+ return Erase;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump() const {
+ if (VALUCycles)
+ dbgs() << " VALUCycles=" << (int)VALUCycles;
+ if (VALUNum < VALU_MAX)
+ dbgs() << " VALUNum=" << (int)VALUNum;
+ if (TRANSCycles)
+ dbgs() << " TRANSCycles=" << (int)TRANSCycles;
+ if (TRANSNum < TRANS_MAX)
+ dbgs() << " TRANSNum=" << (int)TRANSNum;
+ if (TRANSNumVALU < VALU_MAX)
+ dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
+ if (SALUCycles)
+ dbgs() << " SALUCycles=" << (int)SALUCycles;
+ }
+#endif
+ };
+
+ // A map from regunits to the delay info for that regunit.
+ struct DelayState : DenseMap<unsigned, DelayInfo> {
+ // Merge another DelayState into this one by merging the delay info for each
+ // regunit.
+ void merge(const DelayState &RHS) {
+ for (const auto &KV : RHS) {
+ iterator It;
+ bool Inserted;
+ std::tie(It, Inserted) = insert(KV);
+ if (!Inserted)
+ It->second.merge(KV.second);
+ }
+ }
+
+ // Advance the delay info for each regunit, erasing any that are no longer
+ // useful.
+ void advance(DelayType Type, unsigned Cycles) {
+ iterator Next;
+ for (auto I = begin(), E = end(); I != E; I = Next) {
+ Next = std::next(I);
+ if (I->second.advance(Type, Cycles))
+ erase(I);
+ }
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(const TargetRegisterInfo *TRI) const {
+ if (empty()) {
+ dbgs() << " empty\n";
+ return;
+ }
+
+ // Dump DelayInfo for each RegUnit in numerical order.
+ SmallVector<const_iterator, 8> Order;
+ Order.reserve(size());
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ Order.push_back(I);
+ llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
+ return A->first < B->first;
+ });
+ for (const_iterator I : Order) {
+ dbgs() << " " << printRegUnit(I->first, TRI);
+ I->second.dump();
+ dbgs() << "\n";
+ }
+ }
+#endif
+ };
+
+ // The saved delay state at the end of each basic block.
+ DenseMap<MachineBasicBlock *, DelayState> BlockState;
+
+ // Emit an s_delay_alu instruction if necessary before MI. Returns the
+ // emitted instruction if it still has room to encode another delay (so a
+ // later call may fold into it), otherwise nullptr.
+ MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
+ MachineInstr *LastDelayAlu) {
+ // Immediate bit layout (grounded in the masks/shifts below): instid0 in
+ // bits [3:0], instskip in bits [6:4], instid1 in bits [10:7].
+ unsigned Imm = 0;
+
+ // Wait for a TRANS instruction. TRANS_DEP delays are encoded as 4 + N.
+ if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
+ Imm |= 4 + Delay.TRANSNum;
+
+ // Wait for a VALU instruction (if it's more recent than any TRANS
+ // instruction that we're also waiting for).
+ if (Delay.VALUNum < DelayInfo::VALU_MAX &&
+ Delay.VALUNum <= Delay.TRANSNumVALU) {
+ if (Imm & 0xf)
+ Imm |= Delay.VALUNum << 7;
+ else
+ Imm |= Delay.VALUNum;
+ }
+
+ // Wait for an SALU instruction. SALU cycle delays are encoded as 8 + N.
+ if (Delay.SALUCycles) {
+ if (Imm & 0x780) {
+ // We have already encoded a VALU and a TRANS delay. There's no room in
+ // the encoding for an SALU delay as well, so just drop it.
+ } else if (Imm & 0xf) {
+ Imm |= (Delay.SALUCycles + 8) << 7;
+ } else {
+ Imm |= Delay.SALUCycles + 8;
+ }
+ }
+
+ // Don't emit the s_delay_alu instruction if there's nothing to wait for.
+ if (!Imm)
+ return LastDelayAlu;
+
+ // If we only need to wait for one instruction, try encoding it in the last
+ // s_delay_alu that we emitted.
+ if (!(Imm & 0x780) && LastDelayAlu) {
+ // Count the real (non-bundle, non-meta) instructions between the last
+ // s_delay_alu and MI; the instskip field can only encode skips of 0-5.
+ unsigned Skip = 0;
+ for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
+ E = MachineBasicBlock::instr_iterator(MI);
+ ++I != E;) {
+ if (!I->isBundle() && !I->isMetaInstruction())
+ ++Skip;
+ }
+ if (Skip < 6) {
+ MachineOperand &Op = LastDelayAlu->getOperand(0);
+ unsigned LastImm = Op.getImm();
+ assert((LastImm & ~0xf) == 0 &&
+ "Remembered an s_delay_alu with no room for another delay!");
+ LastImm |= Imm << 7 | Skip << 4;
+ Op.setImm(LastImm);
+ return nullptr;
+ }
+ }
+
+ auto &MBB = *MI.getParent();
+ MachineInstr *DelayAlu =
+ BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
+ // Remember the s_delay_alu for next time if there is still room in it to
+ // encode another delay.
+ return (Imm & 0x780) ? nullptr : DelayAlu;
+ }
+
+ // Process one basic block. When Emit is false, just compute the delay state
+ // at the end of the block and return true if the saved state changed (used
+ // for the fixed-point iteration). When Emit is true, insert s_delay_alu
+ // instructions as needed.
+ bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
+ DelayState State;
+ for (auto *Pred : MBB.predecessors())
+ State.merge(BlockState[Pred]);
+
+ LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
+ << "\n";
+ State.dump(TRI););
+
+ bool Changed = false;
+ MachineInstr *LastDelayAlu = nullptr;
+
+ // Iterate over the contents of bundles, but don't emit any instructions
+ // inside a bundle.
+ for (auto &MI : MBB.instrs()) {
+ if (MI.isBundle() || MI.isMetaInstruction())
+ continue;
+
+ // Ignore some more instructions that do not generate any code.
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_RETURN_TO_EPILOG:
+ continue;
+ }
+
+ DelayType Type = getDelayType(MI.getDesc().TSFlags);
+
+ if (instructionWaitsForVALU(MI)) {
+ // Forget about all outstanding VALU delays.
+ State = DelayState();
+ } else if (Type != OTHER) {
+ DelayInfo Delay;
+ // TODO: Scan implicit uses too?
+ for (const auto &Op : MI.explicit_uses()) {
+ if (Op.isReg()) {
+ // One of the operands of the writelane is also the output operand.
+ // This creates the insertion of redundant delays. Hence, we have to
+ // ignore this operand.
+ if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
+ continue;
+ for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
+ auto It = State.find(*UI);
+ if (It != State.end()) {
+ Delay.merge(It->second);
+ State.erase(*UI);
+ }
+ }
+ }
+ }
+ if (Emit && !MI.isBundledWithPred()) {
+ // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
+ // just ignore them?
+ LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
+ }
+ }
+
+ if (Type != OTHER) {
+ // TODO: Scan implicit defs too?
+ for (const auto &Op : MI.defs()) {
+ unsigned Latency = SchedModel.computeOperandLatency(
+ &MI, MI.getOperandNo(&Op), nullptr, 0);
+ for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
+ State[*UI] = DelayInfo(Type, Latency);
+ }
+ }
+
+ // Advance by the number of cycles it takes to issue this instruction.
+ // TODO: Use a more advanced model that accounts for instructions that
+ // take multiple cycles to issue on a particular pipeline.
+ unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
+ // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
+ // instructions on the assumption that they will usually have to be issued
+ // twice?
+ State.advance(Type, Cycles);
+
+ LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
+ }
+
+ // NOTE(review): in Emit mode Changed is never set to true even though
+ // emitDelayAlu may have inserted instructions above, so the final return
+ // value can report "no change" after modifying the function — confirm
+ // against upstream follow-up fixes.
+ if (Emit) {
+ assert(State == BlockState[&MBB] &&
+ "Basic block state should not have changed on final pass!");
+ } else if (State != BlockState[&MBB]) {
+ BlockState[&MBB] = std::move(State);
+ Changed = true;
+ }
+ return Changed;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
+ << "\n");
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasDelayAlu())
+ return false;
+
+ SII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+
+ SchedModel.init(&ST);
+
+ // Calculate the delay state for each basic block, iterating until we reach
+ // a fixed point. Blocks are seeded in reverse order so pop_back_val visits
+ // them roughly top-down.
+ SetVector<MachineBasicBlock *> WorkList;
+ for (auto &MBB : reverse(MF))
+ WorkList.insert(&MBB);
+ while (!WorkList.empty()) {
+ auto &MBB = *WorkList.pop_back_val();
+ bool Changed = runOnMachineBasicBlock(MBB, false);
+ if (Changed)
+ WorkList.insert(MBB.succ_begin(), MBB.succ_end());
+ }
+
+ LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
+
+ // Make one last pass over all basic blocks to emit s_delay_alu
+ // instructions.
+ bool Changed = false;
+ for (auto &MBB : MF)
+ Changed |= runOnMachineBasicBlock(MBB, true);
+ return Changed;
+ }
+};
+
+} // namespace
+
+// Pass identification. AMDGPUInsertDelayAluID is the address exposed to
+// AMDGPUTargetMachine so the pass can be added to the codegen pipeline.
+char AMDGPUInsertDelayAlu::ID = 0;
+
+char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
+
+INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
+ false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f00d7511965a8..137e7048390b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -272,6 +272,12 @@ static cl::opt<bool> EnableSIModeRegisterPass(
cl::init(true),
cl::Hidden);
+// Enable GFX11+ s_delay_alu insertion
+static cl::opt<bool>
+ EnableInsertDelayAlu("amdgpu-enable-delay-alu",
+ cl::desc("Enable s_delay_alu insertion"),
+ cl::init(true), cl::Hidden);
+
// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
@@ -363,6 +369,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
+ initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIModeRegisterPass(*PR);
@@ -1413,6 +1420,10 @@ void GCNPassConfig::addPreEmitPass() {
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
addPass(&PostRAHazardRecognizerID);
+
+ if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
+ addPass(&AMDGPUInsertDelayAluID);
+
addPass(&BranchRelaxationPassID);
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 50e82c1fa2fd6..717bd5f5c3eab 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -57,6 +57,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelUtils.cpp
AMDGPUHSAMetadataStreamer.cpp
+ AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp
AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
index 26919141a418e..ed557dc0d195b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@@ -2,7 +2,7 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s
define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
index 95be8cdd9e426..6f80082de704d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - %s | FileCheck -check-prefix=GFX10NSA %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s
define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
; GFX6-LABEL: gather4_2d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
index d54b0c053e581..7b2122754e770 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -589,6 +589,7 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX11-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
index 32bd0c1248400..30d54ea230b6f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
index 9ddb138d6f915..b7dc460be6474 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -12,9 +12,11 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1
; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
; GCN-NEXT: s_endpgm
@@ -42,13 +44,16 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3
; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
; GCN-NEXT: s_endpgm
@@ -86,8 +91,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
@@ -123,9 +130,11 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1
; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
; GCN-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 273b119aa4752..e43dbe868a70f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -68,8 +68,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %
; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v7, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v8, v11
; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v10, v9
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
@@ -133,8 +135,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v8
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v8, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v8, 0xffff, v9, v12
; GFX11-NEXT: v_and_or_b32 v7, 0xffff, v11, v10
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
@@ -235,8 +239,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
; GFX11-NEXT: v_readfirstlane_b32 s5, v12
; GFX11-NEXT: v_readfirstlane_b32 s6, v13
; GFX11-NEXT: v_readfirstlane_b32 s7, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v18, v19, v[15:17], v[5:7], v[8:10]], s[4:7]
@@ -359,8 +365,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
; GFX11-NEXT: v_readfirstlane_b32 s5, v10
; GFX11-NEXT: v_readfirstlane_b32 s6, v11
; GFX11-NEXT: v_readfirstlane_b32 s7, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[11:12]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v13, v14, v[15:17], v[4:6]], s[4:7] a16
@@ -474,8 +482,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
; GFX11-NEXT: v_readfirstlane_b32 s5, v13
; GFX11-NEXT: v_readfirstlane_b32 s6, v14
; GFX11-NEXT: v_readfirstlane_b32 s7, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[12:13]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[14:15]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[19:20], v21, v[16:18], v[6:8], v[9:11]], s[4:7]
@@ -605,8 +615,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
; GFX11-NEXT: v_readfirstlane_b32 s5, v11
; GFX11-NEXT: v_readfirstlane_b32 s6, v12
; GFX11-NEXT: v_readfirstlane_b32 s7, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[14:15], v16, v[17:19], v[4:6]], s[4:7] a16
@@ -984,6 +996,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
; GFX11-NEXT: v_mov_b32_e32 v1, s5
; GFX11-NEXT: s_mov_b32 s4, 0xb36211c7
; GFX11-NEXT: s_movk_i32 s5, 0x102
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v10, s5
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
@@ -1123,6 +1136,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
; GFX11-NEXT: v_mov_b32_e32 v1, s5
; GFX11-NEXT: s_mov_b32 s4, 0xb36211c6
; GFX11-NEXT: s_movk_i32 s5, 0x102
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v7, s5
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 2b1fef8972317..7d8d0b4bc60a1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -3,7 +3,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_saddsat_i7:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 39d9393c37e21..51d6b055390a8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -3,7 +3,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_ssubsat_i7:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 6682204beccb3..65c8530bab1a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -3,7 +3,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_uaddsat_i7:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index da3ed3fb5d95a..d3248d44258de 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -3,7 +3,7 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_usubsat_i7:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 98e1053e6dd60..1945d4f6baf4b 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -173,6 +173,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_2
@@ -180,6 +181,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_mul_i32 s2, s2, 5
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
@@ -190,6 +192,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -203,12 +206,14 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_mul_i32 s3, s3, 5
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, s3
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
@@ -219,6 +224,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -404,6 +410,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB1_2
@@ -412,6 +419,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
@@ -423,6 +431,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
@@ -437,6 +446,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1132-NEXT: s_mov_b32 s1, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-NEXT: ; %bb.1:
@@ -444,6 +454,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, s2
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
@@ -456,6 +467,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT: s_endpgm
@@ -700,17 +712,23 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -719,12 +737,14 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s7, v1, 63
; GFX1164-NEXT: v_readlane_b32 s6, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s5, 32
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT: v_writelane_b32 v3, s6, 48
@@ -747,6 +767,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -760,25 +781,31 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -796,6 +823,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -962,20 +990,27 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1164-NEXT: v_permlane64_b32 v2, v1
; GFX1164-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1164-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3
@@ -997,15 +1032,20 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1132-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3
@@ -1196,6 +1236,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1204,6 +1245,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_mul_i32 s4, s4, 5
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mov_b32_e32 v0, s4
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1214,6 +1256,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
@@ -1228,12 +1271,14 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB4_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_mul_i32 s3, s3, 5
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_e32 v0, s3
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1244,6 +1289,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
@@ -1461,6 +1507,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1164-NEXT: s_mov_b64 s[6:7], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -1485,10 +1532,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
@@ -1500,6 +1549,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB5_2
; GFX1132-NEXT: ; %bb.1:
@@ -1522,10 +1572,12 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
; GFX1132-NEXT: v_readfirstlane_b32 s5, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
@@ -1775,6 +1827,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB7_2
@@ -1782,6 +1835,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_mul_i32 s2, s2, 5
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1793,6 +1847,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -1806,12 +1861,14 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB7_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_mul_i32 s3, s3, 5
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, s3
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1823,6 +1880,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -2010,6 +2068,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB8_2
@@ -2018,6 +2077,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s2
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
@@ -2031,6 +2091,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_endpgm
@@ -2044,6 +2105,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1132-NEXT: s_mov_b32 s1, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB8_2
; GFX1132-NEXT: ; %bb.1:
@@ -2051,6 +2113,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mul_i32 s2, s0, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, s2
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
@@ -2064,6 +2127,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_endpgm
@@ -2308,17 +2372,23 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -2327,12 +2397,14 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s7, v1, 63
; GFX1164-NEXT: v_readlane_b32 s6, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s5, 32
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT: v_writelane_b32 v3, s6, 48
@@ -2355,6 +2427,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -2368,25 +2441,31 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -2404,6 +2483,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -2570,20 +2650,27 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1164-NEXT: v_permlane64_b32 v2, v1
; GFX1164-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1164-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3
@@ -2605,15 +2692,20 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1132-NEXT: s_mov_b32 exec_lo, s0
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-NEXT: s_mov_b32 s0, exec_lo
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3
@@ -2812,6 +2904,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -2820,6 +2913,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_mul_i32 s4, s4, 5
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mov_b32_e32 v0, s4
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
@@ -2832,6 +2926,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
@@ -2847,12 +2942,14 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB11_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_mul_i32 s3, s3, 5
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_e32 v0, s3
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
@@ -2865,6 +2962,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
@@ -3092,6 +3190,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1164-NEXT: s_mov_b64 s[6:7], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
@@ -3122,6 +3221,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v5
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -3134,6 +3234,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB12_2
; GFX1132-NEXT: ; %bb.1:
@@ -3157,10 +3258,12 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -3482,17 +3585,23 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_mov_b32_e32 v1, -1
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_mov_b32_e32 v3, -1
; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -3501,12 +3610,14 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s7, v1, 63
; GFX1164-NEXT: v_readlane_b32 s6, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s5, 32
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT: v_writelane_b32 v3, s6, 48
@@ -3529,6 +3640,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -3542,25 +3654,31 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, -1
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_mov_b32_e32 v3, -1
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -3578,6 +3696,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -3825,17 +3944,23 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -3844,12 +3969,14 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s7, v1, 63
; GFX1164-NEXT: v_readlane_b32 s6, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s5, 32
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT: v_writelane_b32 v3, s6, 48
@@ -3872,6 +3999,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -3885,25 +4013,31 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -3921,6 +4055,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -4168,17 +4303,23 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -4187,12 +4328,14 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s7, v1, 63
; GFX1164-NEXT: v_readlane_b32 s6, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s5, 32
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT: v_writelane_b32 v3, s6, 48
@@ -4215,6 +4358,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -4228,25 +4372,31 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -4264,6 +4414,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_xor_b32_e32 v0, s3, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -4511,17 +4662,23 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -4530,12 +4687,14 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s7, v1, 63
; GFX1164-NEXT: v_readlane_b32 s6, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s5, 32
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT: v_writelane_b32 v3, s6, 48
@@ -4558,6 +4717,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -4571,25 +4731,31 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -4607,6 +4773,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -4797,6 +4964,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -4817,6 +4985,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
@@ -4830,6 +4999,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
@@ -4849,6 +5019,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
@@ -5098,17 +5269,23 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -5117,12 +5294,14 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s7, v1, 63
; GFX1164-NEXT: v_readlane_b32 s6, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s5, 32
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT: v_writelane_b32 v3, s6, 48
@@ -5145,6 +5324,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -5158,25 +5338,31 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -5194,6 +5380,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -5384,6 +5571,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5404,6 +5592,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
@@ -5417,6 +5606,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
@@ -5436,6 +5626,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
@@ -5685,17 +5876,23 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -5704,12 +5901,14 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s7, v1, 63
; GFX1164-NEXT: v_readlane_b32 s6, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s5, 32
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT: v_writelane_b32 v3, s6, 48
@@ -5732,6 +5931,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -5745,25 +5945,31 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -5781,6 +5987,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -5968,6 +6175,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5988,6 +6196,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
@@ -6001,6 +6210,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
@@ -6020,6 +6230,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
@@ -6269,17 +6480,23 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_mov_b32_e32 v1, -1
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_mov_b32_e32 v3, -1
; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
@@ -6288,12 +6505,14 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s7, v1, 63
; GFX1164-NEXT: v_readlane_b32 s6, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s5, 32
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT: v_writelane_b32 v3, s6, 48
@@ -6316,6 +6535,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -6329,25 +6549,31 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: v_mov_b32_e32 v1, -1
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_mov_b32_e32 v3, -1
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
@@ -6365,6 +6591,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -6552,6 +6779,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -6572,6 +6800,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
@@ -6585,6 +6814,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
@@ -6604,6 +6834,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 416a78225af71..92b4fc24fba9e 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -265,6 +265,7 @@ define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX11-NEXT: v_min_f32_e32 v1, 1.0, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -342,6 +343,7 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX11-NEXT: v_min_f32_e32 v1, 1.0, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -423,6 +425,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, f
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1
; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
@@ -1650,6 +1653,7 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, f
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -1788,6 +1792,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %ou
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -1858,6 +1863,7 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspac
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -2582,6 +2588,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out,
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -2664,6 +2671,7 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out,
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -2818,6 +2826,7 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -3298,6 +3307,7 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, flo
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v0, s4, s5
; GFX11-NEXT: v_add_f32_e64 v1, s4, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp
; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index bb96036f3c42d..70262d66a1951 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -476,8 +476,10 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
; GFX11-NEXT: v_cvt_f32_i32_e32 v9, v1
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: v_mov_b32_e32 v10, 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_f32_e32 v2, 1.0, v8
; GFX11-NEXT: v_add_f32_e32 v3, 1.0, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_mov_b32_e32 v5, v4
; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v7, v4
diff --git a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
index 3b6d2eb625791..963b63ab84c3a 100644
--- a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
+++ b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll
@@ -14,24 +14,31 @@ define amdgpu_ps void @_amdgpu_ps_main(i32 inreg %PrimMask, <2 x float> %InterpC
; GCN-NEXT: lds_param_load v5, attr1.z wait_vdst:15
; GCN-NEXT: lds_param_load v6, attr1.w wait_vdst:15
; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_mbcnt_hi_u32_b32 v7, -1, v7
; GCN-NEXT: v_and_b32_e32 v7, 1, v7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GCN-NEXT: v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v9, v6, v2, v6
; GCN-NEXT: v_interp_p10_f32 v2, v3, v2, v3 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v5, v5, v1, v10 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v6, v6, v1, v9 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v2, v3, v1, v2 wait_exp:7
; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GCN-NEXT: v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6]
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GCN-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v6, vcc_lo
; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6]
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6]
; GCN-NEXT: s_mov_b32 exec_lo, s1
; GCN-NEXT: exp dual_src_blend0 v3, v2, off, off
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 32297e863a46b..3a058774c15a0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -67,6 +67,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc
@@ -154,6 +155,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc
@@ -241,6 +243,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc
@@ -311,6 +314,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
@@ -327,6 +331,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
@@ -401,6 +406,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
@@ -418,6 +424,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
@@ -492,6 +499,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
@@ -509,6 +517,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
@@ -580,6 +589,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_add3_u32 v2, 4, s0, v0
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
; GFX11-SDAG-NEXT: scratch_store_b8 v2, v1, off offset:1 dlc
@@ -597,6 +607,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
@@ -671,6 +682,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_add3_u32 v3, 4, s0, v0
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
@@ -689,6 +701,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
@@ -763,6 +776,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
@@ -780,6 +794,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4
; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index c49c617bbe8a8..a0641558c193c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -54,6 +54,7 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX11-LABEL: zero_init_kernel:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@@ -169,6 +170,7 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX11-PAL-LABEL: zero_init_kernel:
; GFX11-PAL: ; %bb.0:
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
+; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@@ -231,6 +233,7 @@ define void @zero_init_foo() {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@@ -304,6 +307,7 @@ define void @zero_init_foo() {
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
+; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@@ -681,6 +685,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0
; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -743,6 +748,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
@@ -810,6 +816,7 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX11-NEXT: v_and_b32_e32 v1, 15, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -865,6 +872,7 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15
+; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1021,6 +1029,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@@ -1148,6 +1157,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
+; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@@ -1219,6 +1229,7 @@ define void @zero_init_small_offset_foo() {
; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s1, s0
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: s_mov_b32 s3, s0
@@ -1300,6 +1311,7 @@ define void @zero_init_small_offset_foo() {
; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_mov_b32 s0, 0
+; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT: s_mov_b32 s1, s0
; GFX11-PAL-NEXT: s_mov_b32 s2, s0
; GFX11-PAL-NEXT: s_mov_b32 s3, s0
@@ -4217,6 +4229,7 @@ define amdgpu_ps void @large_offset() {
; GFX11-LABEL: large_offset:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v3, v0
@@ -4317,6 +4330,7 @@ define amdgpu_ps void @large_offset() {
; GFX11-PAL-LABEL: large_offset:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v3, v0
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
new file mode 100644
index 0000000000000..570d31454cb23
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu.mir
@@ -0,0 +1,561 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s
+
+---
+name: valu_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_2:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_3
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_3:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_4
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_4:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+ ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_4)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+# There's no encoding for VALU_DEP_5. A normal VALU instruction will have
+# completed already.
+---
+name: valu_dep_5
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_5:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+ ; CHECK-NEXT: v_add_nc_u32_e32 v3, v3, v3
+ ; CHECK-NEXT: v_add_nc_u32_e32 v4, v4, v4
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ $vgpr3 = V_ADD_U32_e32 $vgpr3, $vgpr3, implicit $exec
+ $vgpr4 = V_ADD_U32_e32 $vgpr4, $vgpr4, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}trans32_dep_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}trans32_dep_2:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: trans32_dep_3
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}trans32_dep_3:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+ ; CHECK-NEXT: v_exp_f32_e32 v2, v2
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_3)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+ $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+# There's no encoding for TRANS32_DEP_4. A normal TRANS instruction will have
+# completed already.
+---
+name: trans32_dep_4
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}trans32_dep_4:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+ ; CHECK-NEXT: v_exp_f32_e32 v2, v2
+ ; CHECK-NEXT: v_exp_f32_e32 v3, v3
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+ $vgpr2 = V_EXP_F32_e32 $vgpr2, implicit $exec, implicit $mode
+ $vgpr3 = V_EXP_F32_e32 $vgpr3, implicit $exec, implicit $mode
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: salu_cycle_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}salu_cycle_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: s_mov_b32 s0, 0
+ ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+ $sgpr0 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+...
+
+# There's no need for SALU_CYCLE_2 here because the s_mov will have completed
+# already.
+---
+name: salu_cycle_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}salu_cycle_2:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: s_mov_b32 s0, 0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+ $sgpr0 = S_MOV_B32 0
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_1_same_trans32_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_1_same_trans32_dep_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_exp_f32_e32 v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
+ $vgpr0 = V_EXP_F32_e32 $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+...
+
+# There's no need to encode the VALU dependency because it will complete before
+# the TRANS.
+---
+name: trans32_dep_1_only
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}trans32_dep_1_only:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_exp_f32_e32 v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_EXP_F32_e32 $vgpr1, implicit $exec, implicit $mode
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec
+...
+
+---
+name: valu_dep_1_same_salu_cycle_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_1_same_salu_cycle_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_mov_b32 s0, 0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, s0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $vgpr0 = V_ADD_U32_e32 $sgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_1_next_valu_dep_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_1_next_valu_dep_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: valu_dep_2_next_valu_dep_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_2_next_valu_dep_2:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+...
+
+# There's no need to encode a dependency for the second mul, because the
+# dependency for the first mul has already guaranteed that the add has
+# completed.
+---
+name: valu_dep_1_no_next_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_1_no_next_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_mul_f32_e32 v1, v0, v0
+ ; CHECK-NEXT: v_mul_f32_e32 v2, v0, v0
+ $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
+ $vgpr2 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
+...
+
+# There's no need to encode a dependency for the second add, because the
+# dependency for the second mul has already guaranteed that a later VALU has
+# completed.
+---
+name: valu_dep_1_no_next_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_1_no_next_2:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
+ $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
+ $vgpr1 = V_MUL_F32_e32 $vgpr1, $vgpr1, implicit $exec, implicit $mode
+ $vgpr0 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec, implicit $mode
+...
+
+# There are no wait states between an add/sub/cmp generating carry and an
+# add/sub/cndmask that consumes it, so no need to encode a dependency.
+
+---
+name: implicit_cmp_cndmask
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}implicit_cmp_cndmask:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, v0, v1
+ ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, vcc
+ implicit $vcc = V_CMP_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec
+ $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $vcc, implicit $exec
+...
+
+# TODO: There should be no s_delay_alu here.
+---
+name: explicit_cmp_cndmask
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}explicit_cmp_cndmask:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_cmp_eq_i32_e64 s[0:1], v0, v1
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
+ $sgpr0_sgpr1 = V_CMP_EQ_I32_e64 $vgpr0, $vgpr1, implicit $exec
+ $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr3, 0, $vgpr4, $sgpr0_sgpr1, implicit $exec
+...
+
+---
+name: implicit_addc_addc
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}implicit_addc_addc:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_co_ci_u32_e32 v0, vcc, v0, v0, vcc
+ ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
+ $vgpr0 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+...
+
+---
+name: explicit_addc_addc
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}explicit_addc_addc:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_co_u32 v0, vcc, v0, v0
+ ; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v1, vcc
+ $vgpr0,$vcc = V_ADD_CO_U32_e64 $vgpr0, $vgpr0, 0, implicit $exec
+ $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+...
+
+---
+name: valu_dep_3_bundle
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}valu_dep_3_bundle:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v1, v1
+ ; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v2
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ BUNDLE {
+ $vgpr1 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr2 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+ }
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: if
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}if:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: s_cbranch_vccz .LBB23_2
+ ; CHECK-NEXT: %bb.1:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: .LBB23_2:
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ S_CBRANCH_VCCZ %bb.2, implicit $vcc
+ bb.1:
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ bb.2:
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: else
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}else:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: s_cbranch_vccz .LBB24_2
+ ; CHECK-NEXT: %bb.1
+ ; CHECK-NEXT: s_branch .LBB24_3
+ ; CHECK-NEXT: .LBB24_2:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: .LBB24_3:
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ S_CBRANCH_VCCZ %bb.2, implicit $vcc
+ bb.1:
+ S_BRANCH %bb.3
+ bb.2:
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ bb.3:
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+---
+name: if_else
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}if_else:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: s_cbranch_vccz .LBB25_2
+ ; CHECK-NEXT: %bb.1:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_branch .LBB25_3
+ ; CHECK-NEXT: .LBB25_2:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v1
+ ; CHECK-NEXT: .LBB25_3:
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ S_CBRANCH_VCCZ %bb.2, implicit $vcc
+ bb.1:
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ S_BRANCH %bb.3
+ bb.2:
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ $vgpr0 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec
+ bb.3:
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+# Dependency from outside the loop.
+---
+name: loop_1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}loop_1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: .LBB26_1:
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v1, v0, v0
+ ; CHECK-NEXT: s_cbranch_vccz .LBB26_1
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ bb.1:
+ $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ bb.2:
+...
+
+# Dependency from inside the loop.
+---
+name: loop_2
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}loop_2:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: .LBB27_1:
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ ; CHECK-NEXT: s_cbranch_vccz .LBB27_1
+ bb.1:
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ bb.2:
+...
+
+# No VALU delay across s_sendmsg_rtn because it waits for all outstanding VALU
+# to complete.
+---
+name: sendmsg_rtn
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}sendmsg_rtn:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_mov_b32_e32 v0, 0
+ ; CHECK-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+ ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+ ; CHECK-NEXT: s_add_u32 s0, s0, s0
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr0 = S_SENDMSG_RTN_B32 128
+ $sgpr0 = S_ADD_U32 $sgpr0, $sgpr0, implicit-def $scc
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+# No VALU delay before or across FLAT because it waits for all outstanding VALU
+# to complete.
+---
+name: flat_load
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}flat_load:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_mov_b32_e32 v0, 0
+ ; CHECK-NEXT: v_mov_b32_e32 v1, 0
+ ; CHECK-NEXT: v_mov_b32_e32 v2, 0
+ ; CHECK-NEXT: flat_load_b32 v0, v[0:1]
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v2, v2
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr0 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+...
+
+# No VALU delay across an s_waitcnt_depctr that waits for all outstanding VALU
+# to complete.
+---
+name: waitcnt_depctr
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}waitcnt_depctr:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_mov_b32_e32 v0, 0
+ ; CHECK-NEXT: s_waitcnt_depctr 0xfff
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ S_WAITCNT_DEPCTR 4095
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
+
+# Check that no delays are emitted for writelane instructions.
+---
+name: writelane1
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}writelane1:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_writelane_b32 v0, s0, 0
+ ; CHECK-NEXT: v_writelane_b32 v0, s0, 1
+ ; CHECK-NEXT: v_writelane_b32 v0, s0, 2
+ ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
+ $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0
+ $vgpr0 = V_WRITELANE_B32 $sgpr0, 1, $vgpr0
+ $vgpr0 = V_WRITELANE_B32 $sgpr0, 2, $vgpr0
+ $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
+...
+
+# Check if a VALU delay is added after writelane.
+---
+name: writelane2
+body: |
+ bb.0:
+ ; CHECK-LABEL: {{^}}writelane2:
+ ; CHECK: %bb.0:
+ ; CHECK-NEXT: v_writelane_b32 v0, s0, 3
+ ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+ ; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v0
+ $vgpr0 = V_WRITELANE_B32 $sgpr0, 3, $vgpr0
+ $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 3db196879bc58..01135bdf80608 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -388,6 +388,7 @@
; GCN-O1-NEXT: SI Final Branch Preparation
; GCN-O1-NEXT: SI peephole optimizations
; GCN-O1-NEXT: Post RA hazard recognizer
+; GCN-O1-NEXT: AMDGPU Insert Delay ALU
; GCN-O1-NEXT: Branch relaxation pass
; GCN-O1-NEXT: Register Usage Information Collector Pass
; GCN-O1-NEXT: Live DEBUG_VALUE analysis
@@ -676,6 +677,7 @@
; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
; GCN-O1-OPTS-NEXT: SI peephole optimizations
; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
+; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU
; GCN-O1-OPTS-NEXT: Branch relaxation pass
; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass
; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis
@@ -966,6 +968,7 @@
; GCN-O2-NEXT: SI Final Branch Preparation
; GCN-O2-NEXT: SI peephole optimizations
; GCN-O2-NEXT: Post RA hazard recognizer
+; GCN-O2-NEXT: AMDGPU Insert Delay ALU
; GCN-O2-NEXT: Branch relaxation pass
; GCN-O2-NEXT: Register Usage Information Collector Pass
; GCN-O2-NEXT: Live DEBUG_VALUE analysis
@@ -1268,6 +1271,7 @@
; GCN-O3-NEXT: SI Final Branch Preparation
; GCN-O3-NEXT: SI peephole optimizations
; GCN-O3-NEXT: Post RA hazard recognizer
+; GCN-O3-NEXT: AMDGPU Insert Delay ALU
; GCN-O3-NEXT: Branch relaxation pass
; GCN-O3-NEXT: Register Usage Information Collector Pass
; GCN-O3-NEXT: Live DEBUG_VALUE analysis
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
index a10ea31f2cbcc..7453c3685e467 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.row.ll
@@ -86,6 +86,7 @@ define amdgpu_kernel void @id_row_i32() #0 {
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0x63
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: s_mov_b32 m0, s0
; GFX11-SDAG-NEXT: exp pos0 v0, off, off, off done row_en
; GFX11-SDAG-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index e5190708fba2c..843deff83ab9a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -15,6 +15,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -43,6 +44,7 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index 783f613ac349c..1548e32a265d4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -4,7 +4,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=NOPRT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
; VERDE-LABEL: load_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
index 37c49c287536f..033bbdd0a9545 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
; GFX9-LABEL: gather4_2d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index b1571bf2ec89b..6073f74f31656 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
; GFX9-LABEL: sample_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index d21c9adb8e96d..17809fe040779 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
; TONGA-LABEL: image_sample_2d_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index 4b9748d41a730..c0f85e78ce02b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -2,7 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=VERDE %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6789 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
; VERDE-LABEL: sample_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
index 7113d7b2f0222..657fd54a56094 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
@@ -34,6 +34,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04]
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00]
@@ -62,8 +63,10 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX11-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x93,0x00,0x87,0xbf]
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x09,0x04]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
; GFX11-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x08,0x0f,0xe4,0xf0,0x02,0x00,0x00,0x08]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
@@ -105,6 +108,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x56,0xd6,0x04,0x21,0x0d,0x04]
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x56,0xd6,0x02,0x21,0x05,0x04]
; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
@@ -147,6 +151,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x56,0xd6,0x03,0x21,0x09,0x04]
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x56,0xd6,0x01,0x21,0x01,0x04]
; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06]
@@ -193,6 +198,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x56,0xd6,0x04,0x21,0x01,0x04]
; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x56,0xd6,0x08,0x21,0x05,0x04]
; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x04,0x0f,0x50,0xf1,0x02,0x00,0x00,0x08]
@@ -226,6 +232,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04]
; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04]
; GFX11-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x04,0xf0,0xf0,0x02,0x00,0x00,0x08]
@@ -259,6 +266,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX11-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; encoding: [0x12,0x01,0x87,0xbf]
; GFX11-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x56,0xd6,0x05,0x21,0x01,0x04]
; GFX11-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x56,0xd6,0x0a,0x21,0x05,0x04]
; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x14,0x06,0xf0,0xf0,0x02,0x00,0x00,0x08]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
index 8e6d8ac804c2e..0fed234068975 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index f6ebaf268b36b..864d1f17641a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -12,9 +12,11 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1
; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
; GCN-NEXT: s_endpgm
@@ -42,13 +44,16 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3
; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
; GCN-NEXT: s_endpgm
@@ -86,8 +91,10 @@ define amdgpu_ps void @v_interp_f32_many_vm(float addrspace(1)* %ptr, i32 inreg
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
@@ -123,9 +130,11 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1
; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
; GCN-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 36cb78241b01f..a3786758d2156 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -233,6 +233,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX11-NEXT: v_mov_b32_e32 v8, 2.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4
@@ -325,6 +326,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node
; GFX11-NEXT: v_mov_b32_e32 v5, 2.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4
@@ -428,6 +430,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray,
; GFX11-NEXT: v_mov_b32_e32 v10, 0x102
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000
@@ -515,6 +518,7 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_
; GFX11-NEXT: v_mov_b32_e32 v7, 0x102
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index 8f1bd7d2c0281..fca7fdae148f5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -14,6 +14,7 @@ define amdgpu_kernel void @test_s(i32 addrspace(1)* %out, i32 %src0) {
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
@@ -28,6 +29,7 @@ define amdgpu_kernel void @test_i(i32 addrspace(1)* %out) {
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x63
; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index d1f3955f56515..378af816dbad0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -83,20 +83,25 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v5, v3, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mov_b32_e32 v8, v1
; GFX11-NEXT: v_mul_lo_u32 v5, v5, v2
; GFX11-NEXT: v_mul_lo_u32 v4, v4, v3
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v1, v1, v4, v5
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
@@ -223,31 +228,40 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v5, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0
; GFX11-NEXT: v_mad_i64_i32 v[11:12], null, v5, v3, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v8, v1
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT: v_mul_lo_u32 v8, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v9
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
; GFX11-NEXT: v_mul_lo_u32 v9, v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v11
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v6, v2
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
@@ -372,6 +386,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: s_add_i32 s1, s1, s6
; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, s2
; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, s2
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
@@ -548,8 +563,10 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_lt_i32 s3, 0
; GFX11-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_add_i32 s1, s8, s7
@@ -558,7 +575,9 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX11-NEXT: s_ashr_i32 s4, s1, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s5, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo
@@ -617,9 +636,11 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_mov_b32_e32 v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -677,8 +698,10 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; GFX11-NEXT: v_mov_b32_e32 v6, v0
; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_mov_b32_e32 v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 55a0d65fbafff..fb370ab7b61d2 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -34,6 +34,7 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
@@ -71,6 +72,7 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i64
@@ -108,6 +110,7 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = zext i32 %arg0 to i64
@@ -145,6 +148,7 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = zext i32 %arg0 to i64
@@ -248,22 +252,29 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v0
; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
; GFX11-NEXT: v_mov_b32_e32 v7, v10
; GFX11-NEXT: v_mov_b32_e32 v10, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0
; GFX11-NEXT: v_mov_b32_e32 v8, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
; GFX11-NEXT: v_mov_b32_e32 v7, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i128
@@ -301,6 +312,7 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i32 %arg0 to i63
@@ -346,6 +358,7 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_bfe_i32 v4, v1, 0, 31
; GFX11-NEXT: v_bfe_i32 v5, v0, 0, 31
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext0 = sext i31 %arg0 to i63
@@ -394,9 +407,11 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ext0 = sext i32 %arg0 to i64
@@ -433,6 +448,7 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
@@ -481,8 +497,10 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: v_mov_b32_e32 v2, v0
; GFX11-NEXT: v_mov_b32_e32 v6, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
; GFX11-NEXT: v_and_b32_e32 v5, 1, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v4, v1
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -532,9 +550,11 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
; GFX11-NEXT: v_and_b32_e32 v4, 1, v3
; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%trunc.lhs = and i64 %arg0, 4294967295
@@ -571,6 +591,7 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5]
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl.lhs = shl i64 %arg0, 32
@@ -610,6 +631,7 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: v_mov_b32_e32 v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -731,6 +753,7 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3]
; GFX11-NEXT: v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v0, v6, v2
; GFX11-NEXT: v_xor_b32_e32 v1, v7, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -794,14 +817,17 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_i64_i32 v[8:9], null, v0, v1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v8, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v8, v6
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -852,8 +878,10 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_i64_i32 v[4:5], null, v0, v1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4
; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -908,9 +936,11 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_mov_b32_e32 v7, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3
; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v2, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%m = mul i48 %arg0, %arg1
diff --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
index 35e1cad76e445..4f52ead158b2e 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s
define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
; GFX9-LABEL: mad_i32_vvv:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index aeb5a7947492d..ed140d8c2e933 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -269,6 +269,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc
@@ -283,6 +284,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc
@@ -561,6 +563,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
@@ -575,6 +578,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 768978ebe6364..e1729e0bfd1c2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -165,6 +165,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
@@ -180,6 +181,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
@@ -359,6 +361,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
@@ -374,6 +377,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
More information about the llvm-commits
mailing list