[llvm] [AMDGPU] V_SET_INACTIVE optimizations (PR #98864)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 28 22:05:52 PDT 2024
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/98864
>From b290cededa43c04b45b048a8dc01c0a49e99fde8 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Sun, 14 Jul 2024 17:31:13 +0900
Subject: [PATCH 1/2] [AMDGPU] V_SET_INACTIVE optimizations
Optimize V_SET_INACTIVE by always running it in WWM.
This allows WWM sections to remain unbroken, and enables
V_SET_INACTIVE to be lowered to V_CNDMASK in most cases.
Some cases still require the exec-manipulation and V_MOV sequence
used by the previous code.
GFX9 sees a slight instruction count increase in edge cases due to
its smaller constant bus limit.
Additionally:
- Avoid introducing exec manipulation and V_MOVs where
  a source of V_SET_INACTIVE is also the destination.
- Lower any V_SET_INACTIVE not touched by state marking to a COPY.
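
For illustration, a representative wave32 sequence drawn from the updated
chain_to_chain_wwm checks in amdgpu-cs-chain-cc.ll (registers and constants
are test-specific). Previously set.inactive(3, 4) expanded to:

  v_mov_b32_e32 v1, 3
  s_not_b32 exec_lo, exec_lo
  v_mov_b32_e32 v1, 4
  s_not_b32 exec_lo, exec_lo

With this patch it becomes a single select against the saved exec mask
inside the WWM region:

  s_or_saveexec_b32 s0, -1
  v_cndmask_b32_e64 v1, 4, 3, s0
  s_mov_b32 exec_lo, s0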
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 183 +-
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 68 +-
.../GlobalISel/llvm.amdgcn.set.inactive.ll | 401 ++--
.../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 30 +-
.../AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 29 +-
.../atomic_optimizations_global_pointer.ll | 588 ++---
.../atomic_optimizations_local_pointer.ll | 2031 +++++++----------
.../atomic_optimizations_pixelshader.ll | 66 +-
llvm/test/CodeGen/AMDGPU/cse-convergent.ll | 14 +-
llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll | 18 +-
.../AMDGPU/global_atomics_scan_fadd.ll | 602 ++---
.../AMDGPU/global_atomics_scan_fmax.ll | 428 ++--
.../AMDGPU/global_atomics_scan_fmin.ll | 428 ++--
.../AMDGPU/global_atomics_scan_fsub.ll | 602 ++---
.../llvm.amdgcn.set.inactive.chain.arg.ll | 389 ++--
.../AMDGPU/llvm.amdgcn.set.inactive.ll | 382 ++--
.../AMDGPU/set-inactive-wwm-overwrite.ll | 12 +-
.../AMDGPU/should-not-hoist-set-inactive.ll | 5 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 64 +-
llvm/test/CodeGen/AMDGPU/wqm.ll | 52 +-
llvm/test/CodeGen/AMDGPU/wqm.mir | 4 +-
.../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 704 +++---
llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 504 ++--
.../MIR/AMDGPU/machine-function-info.ll | 7 +-
24 files changed, 3243 insertions(+), 4368 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a857bdba53c3e8..6485b188e8422a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2273,37 +2273,162 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_SET_INACTIVE_B32: {
- unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
- // optimizations (mainly Register Coalescer) aware of WWM register liveness.
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
- MI.eraseFromParent();
- break;
- }
+ case AMDGPU::V_SET_INACTIVE_B32:
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- expandPostRAPseudo(*Copy);
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- expandPostRAPseudo(*Copy);
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+ ? AMDGPU::V_MOV_B64_PSEUDO
+ : AMDGPU::V_MOV_B32_e32;
+ Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ Register DstReg = MI.getOperand(0).getReg();
+ MachineOperand &ActiveSrc = MI.getOperand(1);
+ MachineOperand &InactiveSrc = MI.getOperand(2);
+
+ // Find implicit register defining lanes active outside WWM.
+ // Note: default here is set to ExecReg so that functional MIR is still
+ // generated if implicit def is not found and assertions are disabled.
+ Register ExecSrcReg = ExecReg;
+ for (auto &Op : MI.implicit_operands()) {
+ if (Op.isDef() || !Op.isReg())
+ continue;
+ Register OpReg = Op.getReg();
+ if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+ OpReg == AMDGPU::SCC)
+ continue;
+ ExecSrcReg = OpReg;
+ break;
+ }
+ assert(ExecSrcReg != ExecReg &&
+ "V_SET_INACTIVE must be in known WWM region");
+
+ // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+ // constant bus constraints and the presence of literal constants
+ // present an issue.
+    // Fall back to V_MOV-based lowering in all but the common cases.
+ const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+ const MCInstrDesc &Desc = get(Opcode);
+
+ const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+ const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+ const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+ const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+ const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+ const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+ int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+ int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+ int ConstantBusUses =
+ 1 + // Starts at 1 for ExecSrcReg
+ (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+ (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+ int LiteralConstants =
+ (ActiveSrc.isImm() && !isInlineConstant(ActiveImm) ? 1 : 0) +
+ (InactiveSrc.isImm() && !isInlineConstant(InactiveImm) ? 1 : 0);
+
+ bool UseVCndMask =
+ ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+ if (VMov64 && UseVCndMask) {
+ // Decomposition must not introduce new literals.
+      UseVCndMask &=
+          ActiveSrc.isReg() ||
+          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
+          (!isInlineConstant(ActiveImm));
+      UseVCndMask &= InactiveSrc.isReg() ||
+                     (isInlineConstant(InactiveImmLo) &&
+                      isInlineConstant(InactiveImmHi)) ||
+                     (!isInlineConstant(InactiveImm));
+ }
+
+ if (UseVCndMask && VMov64) {
+ // Dual V_CNDMASK_B32
+ MachineOperand ActiveLo =
+ ActiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0), false,
+ /*isImp=*/false, /*isKill*/ false)
+ : MachineOperand::CreateImm(ActiveImmLo.getSExtValue());
+ MachineOperand ActiveHi =
+ ActiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1), false,
+ /*isImp=*/false, /*isKill*/ ActiveSrc.isKill())
+ : MachineOperand::CreateImm(ActiveImmHi.getSExtValue());
+ MachineOperand InactiveLo =
+ InactiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0), false,
+ /*isImp=*/false, /*isKill*/ false)
+ : MachineOperand::CreateImm(InactiveImmLo.getSExtValue());
+ MachineOperand InactiveHi =
+ InactiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1), false,
+ /*isImp=*/false, /*isKill*/ InactiveSrc.isKill())
+ : MachineOperand::CreateImm(InactiveImmHi.getSExtValue());
+ BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub0))
+ .addImm(0)
+ .add(InactiveLo)
+ .addImm(0)
+ .add(ActiveLo)
+ .addReg(ExecSrcReg)
+ .addReg(DstReg, RegState::ImplicitDefine);
+ BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub1))
+ .addImm(0)
+ .add(InactiveHi)
+ .addImm(0)
+ .add(ActiveHi)
+ .addReg(ExecSrcReg)
+ .addReg(DstReg, RegState::ImplicitDefine);
+ } else if (UseVCndMask) {
+ // Single V_CNDMASK_B32
+ BuildMI(MBB, MI, DL, get(Opcode), DstReg)
+ .addImm(0)
+ .add(InactiveSrc)
+ .addImm(0)
+ .add(ActiveSrc)
+ .addReg(ExecSrcReg);
+ } else {
+ // Fallback V_MOV case.
+ // Avoid unnecessary work if a source VGPR is also the destination.
+ // This can happen if WWM register allocation was efficient.
+ // Note: this assumes WWM execution.
+ bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+ bool DstIsInactive =
+ InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+ if (!DstIsInactive) {
+ // Set exec mask to inactive lanes,
+ // but only if active lanes would be overwritten.
+ if (DstIsActive) {
+ MachineInstr *ExecMI =
+ BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecSrcReg);
+ ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+ }
+ // Copy inactive lanes
+ MachineInstr *VMov =
+ BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+ if (VMov64)
+ expandPostRAPseudo(*VMov);
+ }
+ if (!DstIsActive) {
+ // Set exec mask to active lanes
+ BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+ // Copy active lanes
+ MachineInstr *VMov =
+ BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+ .add(ActiveSrc);
+ if (VMov64)
+ expandPostRAPseudo(*VMov);
+ }
+ // Restore WWM
+ BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+ }
MI.eraseFromParent();
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 9a51cbbb9f6b8e..fe2b40db1d4ea8 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -128,6 +128,7 @@ struct InstrInfo {
char Needs = 0;
char Disabled = 0;
char OutNeeds = 0;
+ char MarkedStates = 0;
};
struct BlockInfo {
@@ -175,9 +176,10 @@ class SIWholeQuadMode : public MachineFunctionPass {
SmallVector<MachineInstr *, 2> LiveMaskQueries;
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
- SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
+ SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
SmallVector<MachineInstr *, 4> KillInstrs;
SmallVector<MachineInstr *, 4> InitExecInstrs;
+ SmallVector<MachineInstr *, 4> SetInactiveInstrs;
void printInfo();
@@ -295,6 +297,9 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
assert(!(Flag & StateExact) && Flag != 0);
+ // Capture all states requested in marking including disabled ones.
+ II.MarkedStates |= Flag;
+
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
// the result of an atomic is used by instruction that requires WQM, where
@@ -478,7 +483,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
- SmallVector<MachineInstr *, 4> SetInactiveInstrs;
SmallVector<MachineInstr *, 4> SoftWQMInstrs;
bool HasImplicitDerivatives =
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
@@ -512,9 +516,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// The WQM intrinsic requires its output to have all the helper lanes
// correct, so we need it to be in WQM.
Flags = StateWQM;
- LowerToCopyInstrs.push_back(&MI);
+ LowerToCopyInstrs.insert(&MI);
} else if (Opcode == AMDGPU::SOFT_WQM) {
- LowerToCopyInstrs.push_back(&MI);
+ LowerToCopyInstrs.insert(&MI);
SoftWQMInstrs.push_back(&MI);
} else if (Opcode == AMDGPU::STRICT_WWM) {
// The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
@@ -555,16 +559,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
GlobalFlags |= StateStrictWQM;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+ // Disable strict states; StrictWQM will be added as required later.
III.Disabled = StateStrict;
MachineOperand &Inactive = MI.getOperand(2);
if (Inactive.isReg()) {
if (Inactive.isUndef()) {
- LowerToCopyInstrs.push_back(&MI);
+ LowerToCopyInstrs.insert(&MI);
} else {
markOperand(MI, Inactive, StateStrictWWM, Worklist);
}
}
SetInactiveInstrs.push_back(&MI);
+ BBI.NeedsLowering = true;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
@@ -1042,6 +1048,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
SmallVector<MachineInstr *, 4> SplitPoints;
+ Register ActiveLanesReg = 0;
char State = BI.InitialState;
for (MachineInstr &MI : llvm::make_early_inc_range(
@@ -1058,6 +1065,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MBB, MI);
break;
+ case AMDGPU::ENTER_STRICT_WWM:
+ ActiveLanesReg = MI.getOperand(0).getReg();
+ break;
+ case AMDGPU::EXIT_STRICT_WWM:
+ ActiveLanesReg = 0;
+ break;
+ case AMDGPU::V_SET_INACTIVE_B32:
+ case AMDGPU::V_SET_INACTIVE_B64:
+ if (ActiveLanesReg) {
+ MI.addOperand(*MBB.getParent(),
+ MachineOperand::CreateReg(ActiveLanesReg, false, true));
+ } else
+ assert(State == StateExact || State == StateWQM);
+ break;
default:
break;
}
@@ -1497,13 +1518,14 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
}
}
for (MachineInstr *MI : LowerToCopyInstrs) {
+ LLVM_DEBUG(dbgs() << "simplify: " << *MI);
+
+ Register RecomputeReg = 0;
if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
assert(MI->getNumExplicitOperands() == 3);
- // the only reason we should be here is V_SET_INACTIVE has
- // an undef input so it is being replaced by a simple copy.
- // There should be a second undef source that we should remove.
- assert(MI->getOperand(2).isUndef());
+ if (MI->getOperand(2).isReg())
+ RecomputeReg = MI->getOperand(2).getReg();
MI->removeOperand(2);
MI->untieRegOperand(1);
} else {
@@ -1514,7 +1536,19 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
? (unsigned)AMDGPU::COPY
: TII->getMovOpcode(TRI->getRegClassForOperandReg(
*MRI, MI->getOperand(0)));
+ int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
+ while (Index >= 0) {
+ MI->removeOperand(Index);
+ Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
+ }
+
MI->setDesc(TII->get(CopyOp));
+ LLVM_DEBUG(dbgs() << " -> " << *MI);
+
+ if (RecomputeReg) {
+ LIS->removeInterval(RecomputeReg);
+ LIS->createAndComputeVirtRegInterval(RecomputeReg);
+ }
}
return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}
@@ -1656,6 +1690,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToMovInstrs.clear();
KillInstrs.clear();
InitExecInstrs.clear();
+ SetInactiveInstrs.clear();
StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1712,6 +1747,21 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
Changed = true;
}
+ // Check if V_SET_INACTIVE was touched by a strict state mode.
+ // If so, promote to WWM; otherwise lower to COPY.
+ for (MachineInstr *MI : SetInactiveInstrs) {
+ if (LowerToCopyInstrs.contains(MI))
+ continue;
+ if (Instructions[MI].MarkedStates & StateStrict) {
+ Instructions[MI].Needs |= StateStrictWWM;
+ Instructions[MI].Disabled &= ~StateStrictWWM;
+ Blocks[MI->getParent()].Needs |= StateStrictWWM;
+ } else {
+ LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
+ LowerToCopyInstrs.insert(MI);
+ }
+ }
+
LLVM_DEBUG(printInfo());
Changed |= lowerLiveMaskQueries();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 8f88aaedf7e95e..137366a45cbdfc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -4,18 +4,39 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
+ store i32 %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
+; GCN-LABEL: set_inactive_imm_poison:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: v_mov_b32_e32 v0, v0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
store i32 %tmp, ptr addrspace(1) %out
ret void
}
@@ -24,18 +45,42 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+ %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
+ store i64 %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
+; GCN-LABEL: set_inactive_imm_poison_64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, v1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+ %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
+ %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
store i64 %tmp, ptr addrspace(1) %out
ret void
}
@@ -45,39 +90,43 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0
-; GCN-NEXT: s_load_dword s5, s[2:3], 0x2c
+; GCN-NEXT: s_buffer_load_dword s6, s[4:7], 0x0
+; GCN-NEXT: s_load_dword s7, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s4, 56
+; GCN-NEXT: s_cmp_lg_u32 s6, 56
; GCN-NEXT: s_cselect_b32 s3, 1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, 1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_cmp_lg_u32 s3, 0
-; GCN-NEXT: s_cbranch_scc0 .LBB2_2
+; GCN-NEXT: s_cbranch_scc0 .LBB4_2
; GCN-NEXT: ; %bb.1: ; %.one
-; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0
; GCN-NEXT: s_mov_b32 s2, 0
-; GCN-NEXT: .LBB2_2: ; %Flow
+; GCN-NEXT: .LBB4_2: ; %Flow
; GCN-NEXT: s_xor_b32 s2, s2, 1
; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cbranch_scc1 .LBB2_4
+; GCN-NEXT: s_cbranch_scc1 .LBB4_4
; GCN-NEXT: ; %bb.3: ; %.zero
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GCN-NEXT: .LBB2_4: ; %.exit
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: .LBB4_4: ; %.exit
; GCN-NEXT: s_endpgm
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
%cmp = icmp eq i32 %val, 56
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
br i1 %cmp, label %.zero, label %.one
.zero:
@@ -96,19 +145,22 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0)
store float %tmp, ptr addrspace(1) %out
ret void
}
@@ -117,20 +169,23 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v2, 0xcccccccd
-; GCN-NEXT: v_mov_b32_e32 v3, 0x4010cccc
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0xcccccccd
+; GCN-NEXT: v_mov_b32_e32 v1, 0x4010cccc
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0)
store double %tmp, ptr addrspace(1) %out
ret void
}
@@ -138,19 +193,22 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0)
store <2 x i16> %tmp, ptr addrspace(1) %out
ret void
}
@@ -158,19 +216,22 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2i16(<2 x half> %tmp.0)
store <2 x half> %tmp, ptr addrspace(1) %out
ret void
}
@@ -179,22 +240,25 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 1
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 1
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0)
store <2 x i32> %tmp, ptr addrspace(1) %out
ret void
}
@@ -203,22 +267,25 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 1.0
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 1.0
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0)
store <2 x float> %tmp, ptr addrspace(1) %out
ret void
}
@@ -226,19 +293,22 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+ %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+ %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0)
store <2 x bfloat> %tmp, ptr addrspace(1) %out
ret void
}
@@ -247,22 +317,25 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
; GCN-LABEL: set_inactive_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0x10001
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 0x10001
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+ %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+ %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
store <4 x i16> %tmp, ptr addrspace(1) %out
ret void
}
@@ -271,22 +344,25 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
; GCN-LABEL: set_inactive_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+ %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+ %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
store <4 x half> %tmp, ptr addrspace(1) %out
ret void
}
@@ -295,22 +371,25 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
; GCN-LABEL: set_inactive_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0x3f803f80
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 0x3f803f80
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+ %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+ %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
store <4 x bfloat> %tmp, ptr addrspace(1) %out
ret void
}
@@ -319,18 +398,23 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0)
store ptr %tmp, ptr addrspace(1) %out
ret void
}
@@ -338,18 +422,22 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0)
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
ret void
}
@@ -357,18 +445,22 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0)
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
ret void
}
@@ -376,18 +468,22 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0)
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
ret void
}
@@ -395,24 +491,31 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0)
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
ret void
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
+declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
attributes #0 = { convergent readnone }
+attributes #1 = { convergent nounwind readnone speculatable willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index c92b78cd45573a..e34ae52fc673ab 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -571,11 +571,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3
-; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4
-; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0
+; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, s0
; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1
; GISEL-GFX11-NEXT: ;;#ASMSTART
; GISEL-GFX11-NEXT: s_nop
@@ -591,10 +590,9 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3
-; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4
-; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL-GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0
+; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, s0
; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1
; GISEL-GFX10-NEXT: ;;#ASMSTART
; GISEL-GFX10-NEXT: s_nop
@@ -609,11 +607,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
; DAGISEL-GFX11-LABEL: chain_to_chain_wwm:
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3
-; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4
-; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4
+; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1
; DAGISEL-GFX11-NEXT: ;;#ASMSTART
@@ -629,11 +626,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
; DAGISEL-GFX10-LABEL: chain_to_chain_wwm:
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3
-; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4
-; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4
+; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index 8d9ed9bb4343c6..320268564f4dbe 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -329,10 +329,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3
-; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4
-; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL-GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0
+; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, s0
; GISEL-GFX11-NEXT: ;;#ASMSTART
; GISEL-GFX11-NEXT: s_nop
; GISEL-GFX11-NEXT: ;;#ASMEND
@@ -351,10 +351,9 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3
-; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4
-; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL-GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0
+; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, s0
; GISEL-GFX10-NEXT: ;;#ASMSTART
; GISEL-GFX10-NEXT: s_nop
; GISEL-GFX10-NEXT: ;;#ASMEND
@@ -371,11 +370,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
+; DAGISEL-GFX11-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3
-; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4
-; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4
+; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL-GFX11-NEXT: ;;#ASMSTART
; DAGISEL-GFX11-NEXT: s_nop
; DAGISEL-GFX11-NEXT: ;;#ASMEND
@@ -393,11 +391,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3
-; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4
-; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4
+; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
; DAGISEL-GFX10-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 16f3ff4be6b501..14e1f9031d6ae2 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1133,11 +1133,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1186,11 +1184,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1233,13 +1229,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1064_DPP-LABEL: add_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1297,11 +1290,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1032_DPP-LABEL: add_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1352,28 +1342,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-LABEL: add_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -1427,37 +1415,34 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-LABEL: add_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
@@ -1491,28 +1476,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-LABEL: add_i32_varying:
; GFX1264_DPP: ; %bb.0: ; %entry
; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -1566,37 +1549,34 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-LABEL: add_i32_varying:
; GFX1232_DPP: ; %bb.0: ; %entry
; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
@@ -2894,15 +2874,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2994,15 +2968,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3091,29 +3059,23 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -3122,8 +3084,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
@@ -3133,8 +3095,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4
@@ -3206,29 +3168,23 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -3237,8 +3193,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
@@ -3303,59 +3259,53 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
@@ -3424,58 +3374,52 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
@@ -3522,23 +3466,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -3643,57 +3581,51 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
-; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16
@@ -4864,11 +4796,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4917,11 +4847,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4964,13 +4892,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1064_DPP-LABEL: sub_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5028,11 +4953,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1032_DPP-LABEL: sub_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5083,28 +5005,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-LABEL: sub_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -5158,37 +5078,34 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-LABEL: sub_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
@@ -5222,28 +5139,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-LABEL: sub_i32_varying:
; GFX1264_DPP: ; %bb.0: ; %entry
; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -5297,37 +5212,34 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-LABEL: sub_i32_varying:
; GFX1232_DPP: ; %bb.0: ; %entry
; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
@@ -6667,15 +6579,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6767,15 +6673,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6864,29 +6764,23 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -6895,8 +6789,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
@@ -6906,8 +6800,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4
@@ -6979,29 +6873,23 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -7010,8 +6898,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
@@ -7076,59 +6964,53 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
@@ -7197,58 +7079,52 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
@@ -7295,23 +7171,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7416,57 +7286,51 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v2
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
; GFX1232_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s7, v4, 15
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1232_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s6
-; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1232_DPP-NEXT: v_writelane_b32 v1, s7, 16
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index c7296185422cef..364af4dc29b1a7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -780,14 +780,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -827,14 +825,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -870,13 +866,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: add_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -926,13 +919,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: add_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -973,34 +963,32 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: add_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -1040,28 +1028,26 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: add_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
@@ -1319,11 +1305,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1354,11 +1338,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1385,11 +1367,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
;
; GFX1064_DPP-LABEL: add_i32_varying_nouse:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1419,11 +1398,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
;
; GFX1032_DPP-LABEL: add_i32_varying_nouse:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1448,31 +1424,28 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_DPP-LABEL: add_i32_varying_nouse:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -1490,28 +1463,25 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_DPP-LABEL: add_i32_varying_nouse:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1132_DPP-NEXT: ; %bb.1:
@@ -2408,15 +2378,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2503,15 +2467,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2595,29 +2553,23 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -2626,8 +2578,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
@@ -2637,8 +2589,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4
@@ -2701,32 +2653,26 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP: ; %bb.0: ; %entry
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -2735,8 +2681,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
@@ -2748,10 +2694,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
@@ -2793,62 +2739,55 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2908,56 +2847,51 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16
@@ -3253,15 +3187,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3330,15 +3258,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3404,23 +3326,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3471,22 +3387,18 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3529,21 +3441,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v6
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v7
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
@@ -3600,22 +3506,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v6
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v7
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s0
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4428,14 +4329,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4475,14 +4374,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4518,13 +4415,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: sub_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4574,13 +4468,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: sub_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4621,34 +4512,32 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: sub_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -4688,28 +4577,26 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: sub_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
@@ -4967,11 +4854,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5002,11 +4887,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5033,11 +4916,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
;
; GFX1064_DPP-LABEL: sub_i32_varying_nouse:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5067,11 +4947,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
;
; GFX1032_DPP-LABEL: sub_i32_varying_nouse:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5096,31 +4973,28 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_DPP-LABEL: sub_i32_varying_nouse:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -5138,28 +5012,25 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_DPP-LABEL: sub_i32_varying_nouse:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1132_DPP-NEXT: ; %bb.1:
@@ -6082,15 +5953,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6177,15 +6042,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6269,29 +6128,23 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -6300,8 +6153,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v4
@@ -6311,8 +6164,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, s4
@@ -6375,32 +6228,26 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP: ; %bb.0: ; %entry
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -6409,8 +6256,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v4
@@ -6422,10 +6269,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
@@ -6467,62 +6314,55 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v7, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s5
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s4
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6582,56 +6422,51 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v6, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16
@@ -6989,13 +6824,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7038,13 +6869,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7083,13 +6910,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: and_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7139,13 +6963,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: and_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7186,34 +7007,32 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: and_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -7253,33 +7072,30 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: and_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
@@ -7670,16 +7486,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
@@ -7734,16 +7544,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
@@ -7792,19 +7596,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: and_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1]
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -7875,19 +7675,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: and_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -7901,11 +7697,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
@@ -7940,50 +7736,46 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: and_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1]
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -8039,43 +7831,39 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: and_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8435,14 +8223,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8482,14 +8268,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8525,13 +8309,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: or_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8581,13 +8362,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: or_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8628,34 +8406,32 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: or_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -8695,28 +8471,26 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: or_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
@@ -9111,16 +8885,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
@@ -9175,16 +8943,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
@@ -9233,19 +8995,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: or_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9316,19 +9074,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: or_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9342,11 +9096,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
@@ -9381,50 +9135,46 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: or_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -9480,43 +9230,39 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: or_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9876,14 +9622,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9923,14 +9667,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9966,13 +9708,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: xor_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10022,13 +9761,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: xor_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10069,34 +9805,32 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: xor_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -10136,28 +9870,26 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: xor_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
@@ -10552,16 +10284,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
@@ -10616,16 +10342,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
@@ -10674,19 +10394,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: xor_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10757,19 +10473,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: xor_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10783,11 +10495,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
@@ -10822,50 +10534,46 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: xor_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -10921,43 +10629,39 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: xor_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11318,12 +11022,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, -1
+; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -11367,12 +11070,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, -1
+; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -11411,13 +11113,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: max_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -11467,13 +11166,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: max_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0
; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -11514,34 +11210,32 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: max_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -11581,33 +11275,30 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: max_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0
; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
@@ -12286,19 +11977,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: max_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX8_DPP-NEXT: s_mov_b32 s0, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX8_DPP-NEXT: s_brev_b32 s1, 1
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX8_DPP-NEXT: s_mov_b64 exec, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
@@ -12385,19 +12076,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: max_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX9_DPP-NEXT: s_mov_b32 s0, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9_DPP-NEXT: s_brev_b32 s1, 1
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX9_DPP-NEXT: s_mov_b64 exec, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
@@ -12484,20 +12175,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-LABEL: max_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: s_mov_b32 s0, 0
-; GFX1064_DPP-NEXT: s_brev_b32 s1, 1
+; GFX1064_DPP-NEXT: s_mov_b32 s4, 0
+; GFX1064_DPP-NEXT: s_brev_b32 s5, 1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s5
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s4, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s5, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -12608,20 +12293,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s0, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s1, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
@@ -12705,80 +12384,73 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s[4:5]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s[4:5]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX1164_DPP-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s1, v3, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12842,57 +12514,49 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
@@ -13258,12 +12922,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, -1
+; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -13307,12 +12970,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, -1
+; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -13351,13 +13013,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: min_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2
+; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -13407,13 +13066,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: min_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2
+; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -13454,34 +13110,32 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: min_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -13521,33 +13175,30 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: min_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
@@ -14229,16 +13880,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: s_mov_b32 s6, -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_mov_b32 s6, -1
; GFX8_DPP-NEXT: s_brev_b32 s7, -2
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX8_DPP-NEXT: s_mov_b64 exec, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6
; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
@@ -14326,16 +13977,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: s_mov_b32 s6, -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_mov_b32 s6, -1
; GFX9_DPP-NEXT: s_brev_b32 s7, -2
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX9_DPP-NEXT: s_mov_b64 exec, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6
; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
@@ -14426,14 +14077,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -14540,17 +14185,11 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_brev_b32 s7, -2
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s6
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s7
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -14639,80 +14278,73 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s6
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s7
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -14775,58 +14407,50 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s6
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s7
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -15190,14 +14814,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -15237,14 +14859,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -15280,13 +14900,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: umax_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -15336,13 +14953,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: umax_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -15383,34 +14997,32 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: umax_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -15450,28 +15062,26 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: umax_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
@@ -16151,14 +15761,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
@@ -16249,14 +15853,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
@@ -16345,13 +15943,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -16457,20 +16051,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP: ; %bb.0: ; %entry
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -16555,80 +16143,73 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -16689,58 +16270,51 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
@@ -17105,13 +16679,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -17154,13 +16724,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -17199,13 +16765,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: umin_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -17255,13 +16818,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: umin_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -17302,34 +16862,32 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: umin_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -17369,33 +16927,30 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: umin_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
@@ -18071,14 +17626,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
@@ -18169,14 +17718,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
@@ -18265,13 +17808,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -18377,20 +17916,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP: ; %bb.0: ; %entry
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -18475,80 +18008,73 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v3
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -18609,58 +18135,51 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v6, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v4
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v5 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index f67fcd6e0caf53..d776f89767d072 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -277,11 +277,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX8-NEXT: s_mov_b64 exec, s[10:11]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_not_b64 exec, exec
-; GFX8-NEXT: v_mov_b32_e32 v2, 0
-; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[10:11]
+; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -334,11 +332,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX9-NEXT: s_mov_b64 exec, s[10:11]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[10:11]
+; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -386,13 +382,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT: s_cbranch_execz .LBB1_4
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[10:11]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -450,13 +443,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1032-NEXT: s_and_saveexec_b32 s8, s9
; GFX1032-NEXT: s_cbranch_execz .LBB1_4
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s9, -1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, v0, s9
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -505,34 +495,31 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1164-NEXT: s_cbranch_execz .LBB1_4
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_not_b64 exec, exec
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[10:11]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
-; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT: v_readlane_b32 s12, v1, 31
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_readlane_b32 s12, v1, 31
; GFX1164-NEXT: v_mov_b32_e32 v2, s12
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s12, v1, 15
; GFX1164-NEXT: v_readlane_b32 s13, v1, 31
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_writelane_b32 v3, s12, 16
; GFX1164-NEXT: s_mov_b64 exec, s[10:11]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX1164-NEXT: v_readlane_b32 s12, v1, 63
@@ -581,33 +568,30 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1132-NEXT: s_and_saveexec_b32 s8, s9
; GFX1132-NEXT: s_cbranch_execz .LBB1_4
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s9, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, v0, s9
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_readlane_b32 s11, v1, 31
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_readlane_b32 s10, v1, 15
; GFX1132-NEXT: s_mov_b32 exec_lo, s9
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s9, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_writelane_b32 v3, s10, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s9
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
index 0d74bd39b56fec..7aca63d34f51bf 100644
--- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
@@ -12,12 +12,7 @@ define i32 @test(i32 %val, i32 %cond) {
; GCN-NEXT: s_mov_b32 exec_lo, s4
; GCN-NEXT: s_or_saveexec_b32 s4, -1
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_mov_b32 exec_lo, s4
-; GCN-NEXT: v_mov_b32_e32 v3, v0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: s_or_saveexec_b32 s4, -1
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s4
; GCN-NEXT: v_mov_b32_e32 v5, 0
@@ -27,12 +22,7 @@ define i32 @test(i32 %val, i32 %cond) {
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: s_or_saveexec_b32 s5, -1
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_mov_b32 exec_lo, s5
-; GCN-NEXT: v_mov_b32_e32 v3, v0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: s_or_saveexec_b32 s5, -1
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s5
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s5
; GCN-NEXT: v_mov_b32_e32 v5, v2
diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
index 82dc6d21cfe33d..310f32ce8f83bc 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
@@ -6,16 +6,13 @@
define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) {
; GCN-LABEL: wwm:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_mov_b32 s7, s4
; GCN-NEXT: s_mov_b32 s6, s3
; GCN-NEXT: s_mov_b32 s5, s2
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: s_mov_b32 s7, s4
; GCN-NEXT: s_mov_b32 s4, s1
; GCN-NEXT: s_mov_b32 s1, 1
-; GCN-NEXT: v_mov_b32_e32 v0, 4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 4, s[2:3]
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s0, 0
@@ -63,16 +60,13 @@ work:
define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) {
; GCN-LABEL: strict_wwm:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_mov_b32 s7, s4
; GCN-NEXT: s_mov_b32 s6, s3
; GCN-NEXT: s_mov_b32 s5, s2
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: s_mov_b32 s7, s4
; GCN-NEXT: s_mov_b32 s4, s1
; GCN-NEXT: s_mov_b32 s1, 1
-; GCN-NEXT: v_mov_b32_e32 v0, 4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 4, s[2:3]
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 44cd2c6e3af675..b84eed19645581 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -816,12 +816,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -900,14 +898,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -982,14 +975,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1048,41 +1036,35 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
@@ -1117,15 +1099,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2054,12 +2031,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2138,14 +2113,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2220,14 +2190,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2286,41 +2251,35 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
@@ -2355,15 +2314,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3352,12 +3306,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3436,14 +3388,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3518,14 +3465,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3584,41 +3526,35 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
@@ -3653,15 +3589,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4146,12 +4077,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4230,14 +4159,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4312,14 +4236,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4378,41 +4297,35 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
@@ -4447,15 +4360,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5469,12 +5377,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5553,14 +5459,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5635,14 +5536,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5701,41 +5597,35 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
@@ -5783,15 +5673,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7467,14 +7352,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -7604,15 +7483,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7734,15 +7607,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7848,17 +7715,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -7896,11 +7758,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -7978,16 +7840,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -9081,14 +8938,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -9186,15 +9037,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9282,15 +9127,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9362,17 +9201,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9407,9 +9241,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -9460,16 +9295,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -10540,14 +10370,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -10645,15 +10469,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10741,15 +10559,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10821,17 +10633,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -10866,9 +10673,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -10919,16 +10727,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -11481,14 +11284,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -11586,15 +11383,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11682,15 +11473,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11762,17 +11547,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11807,9 +11587,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -11860,16 +11641,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -13587,14 +13363,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -13724,15 +13494,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13854,15 +13618,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13968,17 +13726,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -14016,11 +13769,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -14098,16 +13851,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index f0196fadc4b3fd..4ddb48d4587fd2 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -718,12 +718,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -811,15 +809,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -889,15 +882,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -949,48 +937,42 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -1027,44 +1009,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -1783,12 +1760,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1876,15 +1851,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1954,15 +1924,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2014,48 +1979,42 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -2092,44 +2051,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -2848,12 +2802,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2941,15 +2893,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3019,15 +2966,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3079,48 +3021,42 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -3157,44 +3093,39 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -4825,14 +4756,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -4971,15 +4896,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5111,15 +5030,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5233,17 +5146,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5291,11 +5199,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -5375,16 +5283,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -6307,14 +6210,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -6421,15 +6318,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6512,15 +6403,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6585,17 +6470,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -6640,9 +6520,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -6696,16 +6577,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -8392,14 +8268,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -8538,15 +8408,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8678,15 +8542,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8800,17 +8658,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8858,11 +8711,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -8942,16 +8795,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index f672c9c6afa22b..d4efdaea3d3497 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -718,12 +718,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -811,15 +809,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -889,15 +882,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -949,48 +937,42 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -1027,44 +1009,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -1783,12 +1760,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1876,15 +1851,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1954,15 +1924,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2014,48 +1979,42 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -2092,44 +2051,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -2848,12 +2802,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2941,15 +2893,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3019,15 +2966,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3079,48 +3021,42 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -3157,44 +3093,39 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
-; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -4825,14 +4756,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -4971,15 +4896,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5111,15 +5030,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5233,17 +5146,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5291,11 +5199,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -5375,16 +5283,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -6307,14 +6210,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -6421,15 +6318,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6512,15 +6403,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6585,17 +6470,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -6640,9 +6520,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -6696,16 +6577,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -8392,14 +8268,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -8538,15 +8408,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8678,15 +8542,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8800,17 +8658,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8858,11 +8711,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -8942,16 +8795,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 2165a6ff65e3b5..45624587770420 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -894,12 +894,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -978,14 +976,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1060,14 +1053,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1126,41 +1114,35 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
@@ -1208,15 +1190,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2244,12 +2221,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2328,14 +2303,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2410,14 +2380,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2476,41 +2441,35 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
@@ -2558,15 +2517,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3594,12 +3548,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3678,14 +3630,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3760,14 +3707,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3826,41 +3768,35 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
@@ -3908,15 +3844,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4440,12 +4371,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4524,14 +4453,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4606,14 +4530,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4672,41 +4591,35 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
@@ -4754,15 +4667,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5789,12 +5697,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5873,14 +5779,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5955,14 +5856,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -6021,41 +5917,35 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
@@ -6103,15 +5993,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7787,14 +7672,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -7924,15 +7803,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8054,15 +7927,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8168,17 +8035,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -8216,11 +8078,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -8298,16 +8160,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -9400,14 +9257,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -9505,15 +9356,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9601,15 +9446,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9681,17 +9520,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9726,9 +9560,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -9779,16 +9614,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -10859,14 +10689,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -10964,15 +10788,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11060,15 +10878,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11140,17 +10952,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11185,9 +10992,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -11238,16 +11046,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -11800,14 +11603,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -11905,15 +11702,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -12001,15 +11792,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -12081,17 +11866,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -12126,9 +11906,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -12179,16 +11960,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -13905,14 +13681,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -14042,15 +13812,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -14172,15 +13936,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -14286,17 +14044,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -14334,11 +14087,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -14416,16 +14169,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index b3acd4949301e1..c1b58f1795aaec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -12,97 +12,204 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
; GFX11-LABEL: set_inactive_chain_arg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v11
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: v_mov_b32_e32 v0, v10
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT: global_store_b32 v[8:9], v0, off
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: global_store_b32 v[8:9], v1, off
; GFX11-NEXT: s_endpgm
;
; GFX10-LABEL: set_inactive_chain_arg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v11
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX10-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-NEXT: v_mov_b32_e32 v0, v10
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT: global_store_dword v[8:9], v0, off
+; GFX10-NEXT: s_mov_b32 exec_lo, s0
+; GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
+; GFX10-NEXT: s_mov_b32 exec_lo, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: global_store_dword v[8:9], v1, off
; GFX10-NEXT: s_endpgm
;
; GFX11_W64-LABEL: set_inactive_chain_arg:
; GFX11_W64: ; %bb.0:
; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT: v_mov_b32_e32 v0, v11
-; GFX11_W64-NEXT: s_not_b64 exec, exec
+; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10
-; GFX11_W64-NEXT: s_not_b64 exec, exec
-; GFX11_W64-NEXT: global_store_b32 v[8:9], v0, off
+; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
+; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11_W64-NEXT: v_mov_b32_e32 v1, v0
+; GFX11_W64-NEXT: global_store_b32 v[8:9], v1, off
; GFX11_W64-NEXT: s_endpgm
;
; GFX10_W64-LABEL: set_inactive_chain_arg:
; GFX10_W64: ; %bb.0:
; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, v11
-; GFX10_W64-NEXT: s_not_b64 exec, exec
+; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10
-; GFX10_W64-NEXT: s_not_b64 exec, exec
-; GFX10_W64-NEXT: global_store_dword v[8:9], v0, off
+; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
+; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, v0
+; GFX10_W64-NEXT: global_store_dword v[8:9], v1, off
; GFX10_W64-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0
- store i32 %tmp, ptr addrspace(1) %out
+ %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp)
+ store i32 %wwm, ptr addrspace(1) %out
ret void
}
define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i64 %inactive, i64 %active) {
-; GFX11-LABEL: set_inactive_chain_arg_64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v12
-; GFX11-NEXT: v_mov_b32_e32 v1, v13
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT: v_mov_b32_e32 v0, v10
-; GFX11-NEXT: v_mov_b32_e32 v1, v11
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT: global_store_b64 v[8:9], v[0:1], off
-; GFX11-NEXT: s_endpgm
+; GISEL11-LABEL: set_inactive_chain_arg_64:
+; GISEL11: ; %bb.0:
+; GISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_mov_b32 v1, v11
+; GISEL11-NEXT: s_mov_b32 exec_lo, s0
+; GISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
+; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; GISEL11-NEXT: s_mov_b32 exec_lo, s0
+; GISEL11-NEXT: v_mov_b32_e32 v2, v0
+; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL11-NEXT: v_mov_b32_e32 v3, v1
+; GISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off
+; GISEL11-NEXT: s_endpgm
;
-; GFX10-LABEL: set_inactive_chain_arg_64:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v12
-; GFX10-NEXT: v_mov_b32_e32 v1, v13
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT: v_mov_b32_e32 v0, v10
-; GFX10-NEXT: v_mov_b32_e32 v1, v11
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT: global_store_dwordx2 v[8:9], v[0:1], off
-; GFX10-NEXT: s_endpgm
+; DAGISEL11-LABEL: set_inactive_chain_arg_64:
+; DAGISEL11: ; %bb.0:
+; DAGISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10
+; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL11-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL11-NEXT: v_mov_b32_e32 v3, v1
+; DAGISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11-NEXT: s_endpgm
;
-; GFX11_W64-LABEL: set_inactive_chain_arg_64:
-; GFX11_W64: ; %bb.0:
-; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT: v_mov_b32_e32 v0, v12
-; GFX11_W64-NEXT: v_mov_b32_e32 v1, v13
-; GFX11_W64-NEXT: s_not_b64 exec, exec
-; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10
-; GFX11_W64-NEXT: v_mov_b32_e32 v1, v11
-; GFX11_W64-NEXT: s_not_b64 exec, exec
-; GFX11_W64-NEXT: global_store_b64 v[8:9], v[0:1], off
-; GFX11_W64-NEXT: s_endpgm
+; GISEL10-LABEL: set_inactive_chain_arg_64:
+; GISEL10: ; %bb.0:
+; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT: v_mov_b32_e32 v0, v10
+; GISEL10-NEXT: v_mov_b32_e32 v1, v11
+; GISEL10-NEXT: s_mov_b32 exec_lo, s0
+; GISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
+; GISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; GISEL10-NEXT: s_mov_b32 exec_lo, s0
+; GISEL10-NEXT: v_mov_b32_e32 v2, v0
+; GISEL10-NEXT: v_mov_b32_e32 v3, v1
+; GISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; GISEL10-NEXT: s_endpgm
;
-; GFX10_W64-LABEL: set_inactive_chain_arg_64:
-; GFX10_W64: ; %bb.0:
-; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, v12
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, v13
-; GFX10_W64-NEXT: s_not_b64 exec, exec
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, v11
-; GFX10_W64-NEXT: s_not_b64 exec, exec
-; GFX10_W64-NEXT: global_store_dwordx2 v[8:9], v[0:1], off
-; GFX10_W64-NEXT: s_endpgm
+; DAGISEL10-LABEL: set_inactive_chain_arg_64:
+; DAGISEL10: ; %bb.0:
+; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT: v_mov_b32_e32 v1, v11
+; DAGISEL10-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL10-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL10-NEXT: v_mov_b32_e32 v3, v1
+; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; DAGISEL10-NEXT: s_endpgm
+;
+; GISEL11_W64-LABEL: set_inactive_chain_arg_64:
+; GISEL11_W64: ; %bb.0:
+; GISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v10
+; GISEL11_W64-NEXT: v_mov_b32_e32 v1, v11
+; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL11_W64-NEXT: v_mov_b32_e32 v2, v0
+; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL11_W64-NEXT: v_mov_b32_e32 v3, v1
+; GISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off
+; GISEL11_W64-NEXT: s_endpgm
+;
+; DAGISEL11_W64-LABEL: set_inactive_chain_arg_64:
+; DAGISEL11_W64: ; %bb.0:
+; DAGISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v11
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v3, v1
+; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11_W64-NEXT: s_endpgm
+;
+; GISEL10_W64-LABEL: set_inactive_chain_arg_64:
+; GISEL10_W64: ; %bb.0:
+; GISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v10
+; GISEL10_W64-NEXT: v_mov_b32_e32 v1, v11
+; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; GISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL10_W64-NEXT: v_mov_b32_e32 v2, v0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v3, v1
+; GISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; GISEL10_W64-NEXT: s_endpgm
+;
+; DAGISEL10_W64-LABEL: set_inactive_chain_arg_64:
+; DAGISEL10_W64: ; %bb.0:
+; DAGISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v11
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v3, v1
+; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; DAGISEL10_W64-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0
- store i64 %tmp, ptr addrspace(1) %out
+ %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp)
+ store i64 %wwm, ptr addrspace(1) %out
ret void
}
@@ -113,16 +220,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: v_mov_b32_e32 v0, v10
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v11
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v0
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v2, v1
; GFX11-NEXT: global_store_b32 v[8:9], v2, off
; GFX11-NEXT: s_endpgm
@@ -133,11 +237,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
; GFX10-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-NEXT: v_mov_b32_e32 v0, v10
; GFX10-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-NEXT: v_mov_b32_e32 v0, v11
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT: v_mov_b32_e32 v0, v0
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX10-NEXT: s_mov_b32 exec_lo, s0
@@ -151,17 +252,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10
; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX11_W64-NEXT: v_mov_b32_e32 v0, v11
-; GFX11_W64-NEXT: s_not_b64 exec, exec
-; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11_W64-NEXT: v_mov_b32_e32 v0, v0
-; GFX11_W64-NEXT: s_not_b64 exec, exec
; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX11_W64-NEXT: s_waitcnt_depctr 0xfff
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11_W64-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11_W64-NEXT: v_mov_b32_e32 v2, v1
; GFX11_W64-NEXT: global_store_b32 v[8:9], v2, off
; GFX11_W64-NEXT: s_endpgm
@@ -172,11 +269,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10
; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, v11
-; GFX10_W64-NEXT: s_not_b64 exec, exec
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, v0
-; GFX10_W64-NEXT: s_not_b64 exec, exec
; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
@@ -214,11 +308,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; GISEL11-NEXT: v_mov_b32_e32 v11, 0
; GISEL11-NEXT: s_waitcnt lgkmcnt(0)
; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL11-NEXT: v_mov_b32_e32 v12, v43
-; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL11-NEXT: v_mov_b32_e32 v12, v40
-; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; GISEL11-NEXT: s_mov_b32 exec_lo, s0
; GISEL11-NEXT: v_mov_b32_e32 v0, v12
; GISEL11-NEXT: global_store_b32 v[41:42], v0, off
; GISEL11-NEXT: s_endpgm
@@ -244,11 +337,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; DAGISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0
; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off
; DAGISEL11-NEXT: s_endpgm
@@ -283,10 +375,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GISEL10-NEXT: v_mov_b32_e32 v12, v43
-; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL10-NEXT: v_mov_b32_e32 v12, v40
-; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; GISEL10-NEXT: s_mov_b32 exec_lo, s0
; GISEL10-NEXT: v_mov_b32_e32 v0, v12
; GISEL10-NEXT: global_store_dword v[41:42], v0, off
; GISEL10-NEXT: s_endpgm
@@ -321,10 +412,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off
; DAGISEL10-NEXT: s_endpgm
@@ -357,11 +447,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0
; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0)
; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43
-; GISEL11_W64-NEXT: s_not_b64 exec, exec
-; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40
-; GISEL11_W64-NEXT: s_not_b64 exec, exec
-; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
; GISEL11_W64-NEXT: s_endpgm
@@ -394,11 +483,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0
; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL11_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL11_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
; DAGISEL11_W64-NEXT: s_endpgm
@@ -433,10 +521,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43
-; GISEL10_W64-NEXT: s_not_b64 exec, exec
-; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40
-; GISEL10_W64-NEXT: s_not_b64 exec, exec
+; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12
; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off
; GISEL10_W64-NEXT: s_endpgm
@@ -471,10 +558,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL10_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL10_W64-NEXT: s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off
; DAGISEL10_W64-NEXT: s_endpgm
@@ -511,11 +597,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; GISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0
; GISEL11-NEXT: s_waitcnt lgkmcnt(0)
; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL11-NEXT: v_mov_b32_e32 v12, v43
-; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL11-NEXT: v_mov_b32_e32 v12, v40
-; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; GISEL11-NEXT: s_mov_b32 exec_lo, s0
; GISEL11-NEXT: v_mov_b32_e32 v0, v12
; GISEL11-NEXT: global_store_b32 v[41:42], v0, off
; GISEL11-NEXT: s_endpgm
@@ -541,11 +626,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; DAGISEL11-NEXT: v_mov_b32_e32 v11, 0
; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off
; DAGISEL11-NEXT: s_endpgm
@@ -580,10 +664,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GISEL10-NEXT: v_mov_b32_e32 v12, v43
-; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL10-NEXT: v_mov_b32_e32 v12, v40
-; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; GISEL10-NEXT: s_mov_b32 exec_lo, s0
; GISEL10-NEXT: v_mov_b32_e32 v0, v12
; GISEL10-NEXT: global_store_dword v[41:42], v0, off
; GISEL10-NEXT: s_endpgm
@@ -618,10 +701,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off
; DAGISEL10-NEXT: s_endpgm
@@ -654,11 +736,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0
; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0)
; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43
-; GISEL11_W64-NEXT: s_not_b64 exec, exec
-; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40
-; GISEL11_W64-NEXT: s_not_b64 exec, exec
-; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
; GISEL11_W64-NEXT: s_endpgm
@@ -691,11 +772,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0
; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL11_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL11_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
; DAGISEL11_W64-NEXT: s_endpgm
@@ -730,10 +810,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43
-; GISEL10_W64-NEXT: s_not_b64 exec, exec
-; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40
-; GISEL10_W64-NEXT: s_not_b64 exec, exec
+; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12
; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off
; GISEL10_W64-NEXT: s_endpgm
@@ -768,10 +847,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL10_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL10_W64-NEXT: s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off
; DAGISEL10_W64-NEXT: s_endpgm
@@ -786,6 +864,7 @@ declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)
+declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
declare amdgpu_gfx void @gfx_callee(<12 x i32>)
attributes #0 = { convergent readnone willreturn nocallback nofree}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 114d2d099ab7b1..6dc4a2ce0504b8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -5,18 +5,22 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
store i32 %tmp, ptr addrspace(1) %out
ret void
}
@@ -25,13 +29,15 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
store i32 %tmp, ptr addrspace(1) %out
ret void
}
@@ -40,20 +46,25 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+ %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+ %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
store i64 %tmp, ptr addrspace(1) %out
ret void
}
@@ -63,13 +74,16 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
+ %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
+ %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
store i64 %tmp, ptr addrspace(1) %out
ret void
}
@@ -82,12 +96,15 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s4, 56
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b64 s[2:3], -1
; GCN-NEXT: s_cbranch_scc1 .LBB4_3
; GCN-NEXT: ; %bb.1: ; %Flow
@@ -96,19 +113,20 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: .LBB4_2: ; %.exit
; GCN-NEXT: s_endpgm
; GCN-NEXT: .LBB4_3: ; %.one
-; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0
; GCN-NEXT: s_cbranch_execnz .LBB4_2
; GCN-NEXT: .LBB4_4: ; %.zero
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
%cmp = icmp eq i32 %val, 56
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
br i1 %cmp, label %.zero, label %.one
.zero:
@@ -127,19 +145,23 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s7, 0x40400000
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x40400000
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0)
store float %tmp, ptr addrspace(1) %out
ret void
}
@@ -148,22 +170,27 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0)
store double %tmp, ptr addrspace(1) %out
ret void
}
@@ -171,19 +198,23 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s7, 0x10001
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x10001
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0)
store <2 x i16> %tmp, ptr addrspace(1) %out
ret void
}
@@ -191,19 +222,23 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s7, 0x3c003c00
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x3c003c00
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2i16(<2 x half> %tmp.0)
store <2 x half> %tmp, ptr addrspace(1) %out
ret void
}
@@ -212,22 +247,27 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s8, 1
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s10, 1
+; GCN-NEXT: s_mov_b32 s11, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0)
store <2 x i32> %tmp, ptr addrspace(1) %out
ret void
}
@@ -236,22 +276,27 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s8, 1.0
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s10, 1.0
+; GCN-NEXT: s_mov_b32 s11, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0)
store <2 x float> %tmp, ptr addrspace(1) %out
ret void
}
@@ -259,19 +304,23 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s7, 0x3f803f80
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x3f803f80
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+ %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+ %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0)
store <2 x bfloat> %tmp, ptr addrspace(1) %out
ret void
}
@@ -280,22 +329,27 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
; GCN-LABEL: set_inactive_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s8, 0x10001
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s10, 0x10001
+; GCN-NEXT: s_mov_b32 s11, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+ %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+ %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
store <4 x i16> %tmp, ptr addrspace(1) %out
ret void
}
@@ -304,22 +358,27 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
; GCN-LABEL: set_inactive_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s8, 0x3c003c00
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s10, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s11, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+ %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+ %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
store <4 x half> %tmp, ptr addrspace(1) %out
ret void
}
@@ -328,22 +387,27 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
; GCN-LABEL: set_inactive_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s8, 0x3f803f80
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s10, 0x3f803f80
+; GCN-NEXT: s_mov_b32 s11, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+ %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+ %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
store <4 x bfloat> %tmp, ptr addrspace(1) %out
ret void
}
@@ -352,20 +416,25 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0)
store ptr %tmp, ptr addrspace(1) %out
ret void
}
@@ -373,18 +442,22 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0)
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
ret void
}
@@ -392,18 +465,22 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0)
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
ret void
}
@@ -411,18 +488,22 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0)
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
ret void
}
@@ -430,24 +511,31 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0)
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
ret void
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
+declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
attributes #0 = { convergent readnone }
+attributes #1 = { convergent nounwind readnone speculatable willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index 81858bd3d29ee0..f60786c1bacbff 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -15,11 +15,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: s_cbranch_execz .LBB0_4
; GCN-NEXT: ; %bb.3: ; %.then
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
; GCN-NEXT: s_or_saveexec_b32 s1, -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s1
@@ -82,12 +79,7 @@ define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrs
; GCN-NEXT: .LBB1_5: ; %.else
; GCN-NEXT: s_or_saveexec_b32 s1, -1
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b32 exec_lo, s1
-; GCN-NEXT: v_mov_b32_e32 v2, v3
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: s_or_saveexec_b32 s1, -1
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v3, s1
; GCN-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s1
; GCN-NEXT: v_mov_b32_e32 v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
index 09e342fe190666..90b32e29e98f67 100644
--- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
@@ -23,11 +23,8 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i
; GCN-NEXT: s_cbranch_execz .LBB0_1
; GCN-NEXT: ; %bb.3: ; %bb1
; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
-; GCN-NEXT: v_mov_b32_e32 v3, s4
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
; GCN-NEXT: s_or_saveexec_b32 s9, -1
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, s4, s9
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s9
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index c3a81771a2790c..ff692acda3c255 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1674,13 +1674,13 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0
; GFX1032-NEXT: s_clause 0x1
; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v0, s4
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v0, 42
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, 42, s4, s2
+; GFX1032-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: global_store_dword v1, v2, s[0:1]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_set_inactive:
@@ -1688,15 +1688,16 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0
; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v0, s4
-; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: v_mov_b32_e32 v0, 42
-; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, 42, s4, s[2:3]
+; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: global_store_dword v1, v2, s[0:1]
; GFX1064-NEXT: s_endpgm
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
store i32 %tmp, ptr addrspace(1) %out
ret void
}
@@ -1705,31 +1706,32 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in)
; GFX1032-LABEL: test_set_inactive_64:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v0, s6
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s7, s0
+; GFX1032-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_set_inactive_64:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
-; GFX1064-NEXT: v_mov_b32_e32 v1, s7
-; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s7, s[0:1]
+; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX1064-NEXT: s_endpgm
- %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
+ %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
+ %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
store i64 %tmp, ptr addrspace(1) %out
ret void
}
@@ -2921,6 +2923,8 @@ declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
declare float @llvm.amdgcn.strict.wwm.f32(float)
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32)
+declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
declare float @llvm.amdgcn.wwm.f32(float)
declare i32 @llvm.amdgcn.wqm.i32(i32)
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 6b4c2da772cdc2..ab84c0c905771b 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -835,12 +835,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-W64-NEXT: s_not_b64 exec, exec
-; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
@@ -851,12 +848,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
@@ -1317,7 +1311,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
-; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec
+; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
@@ -1334,7 +1328,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec
+; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
@@ -2263,11 +2257,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
-; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
@@ -2293,11 +2284,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
@@ -2744,12 +2732,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-W64-NEXT: s_not_b64 exec, exec
-; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
@@ -2760,12 +2745,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
@@ -2799,11 +2781,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
-; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
@@ -2829,11 +2808,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index ef6d0780f395fd..534865173d9a59 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -282,10 +282,10 @@ body: |
#
#CHECK-NOT: ENTER_STRICT_WWM
#CHECK: BUFFER_LOAD_DWORDX2
-#CHECK-NOT: ENTER_STRICT_WWM
+#CHECK: ENTER_STRICT_WWM
#CHECK: V_SET_INACTIVE_B32
#CHECK: V_SET_INACTIVE_B32
-#CHECK: ENTER_STRICT_WWM
+#CHECK-NOT: ENTER_STRICT_WWM
#CHECK: V_MAX
name: test_wwm_set_inactive_propagation
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e79cb66dcd7760..47e1897f6b420a 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -30,15 +30,15 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -80,17 +80,10 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[34:35]
+; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
@@ -177,11 +170,11 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
+; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -208,12 +201,8 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[34:35]
+; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -270,34 +259,25 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[34:35]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc
-; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[36:37]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: .LBB1_2: ; %merge
+; GFX9-O3-NEXT: ; %bb.2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -378,26 +358,26 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-O0-NEXT: s_mov_b32 s40, s6
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: s_mov_b32 s42, s6
; GFX9-O0-NEXT: s_mov_b32 s34, s4
-; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT: s_mov_b32 s41, s7
-; GFX9-O0-NEXT: s_mov_b32 s42, s41
-; GFX9-O0-NEXT: s_mov_b32 s43, s40
+; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43
+; GFX9-O0-NEXT: s_mov_b32 s43, s7
+; GFX9-O0-NEXT: s_mov_b32 s44, s43
+; GFX9-O0-NEXT: s_mov_b32 s45, s42
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b32 s35, s5
-; GFX9-O0-NEXT: s_mov_b32 s44, s35
+; GFX9-O0-NEXT: s_mov_b32 s46, s35
; GFX9-O0-NEXT: s_mov_b32 s36, s34
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
-; GFX9-O0-NEXT: s_mov_b32 s37, s44
-; GFX9-O0-NEXT: s_mov_b32 s38, s43
-; GFX9-O0-NEXT: s_mov_b32 s39, s42
+; GFX9-O0-NEXT: s_mov_b32 s37, s46
+; GFX9-O0-NEXT: s_mov_b32 s38, s45
+; GFX9-O0-NEXT: s_mov_b32 s39, s44
; GFX9-O0-NEXT: s_mov_b32 s34, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: s_getpc_b64 s[42:43]
; GFX9-O0-NEXT: s_add_u32 s42, s42, strict_wwm_called@rel32@lo+4
; GFX9-O0-NEXT: s_addc_u32 s43, s43, strict_wwm_called@rel32@hi+12
@@ -437,11 +417,11 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0x400
; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: s_getpc_b64 s[36:37]
; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4
@@ -559,7 +539,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-LABEL: strict_wwm_call_i64:
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s48, s33
+; GFX9-O0-NEXT: s_mov_b32 s50, s33
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -583,41 +563,41 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1
+; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 0
+; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 1
; GFX9-O0-NEXT: s_mov_b32 s34, s8
-; GFX9-O0-NEXT: s_mov_b32 s38, s6
+; GFX9-O0-NEXT: s_mov_b32 s40, s6
; GFX9-O0-NEXT: s_mov_b32 s36, s4
-; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39
-; GFX9-O0-NEXT: s_mov_b32 s39, s7
-; GFX9-O0-NEXT: s_mov_b32 s35, s39
-; GFX9-O0-NEXT: s_mov_b32 s44, s38
+; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
+; GFX9-O0-NEXT: s_mov_b32 s41, s7
+; GFX9-O0-NEXT: s_mov_b32 s35, s41
+; GFX9-O0-NEXT: s_mov_b32 s42, s40
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37
; GFX9-O0-NEXT: s_mov_b32 s37, s5
-; GFX9-O0-NEXT: s_mov_b32 s45, s37
-; GFX9-O0-NEXT: s_mov_b32 s40, s36
-; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43
-; GFX9-O0-NEXT: s_mov_b32 s41, s45
-; GFX9-O0-NEXT: s_mov_b32 s42, s44
-; GFX9-O0-NEXT: s_mov_b32 s43, s35
-; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2
-; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3
+; GFX9-O0-NEXT: s_mov_b32 s43, s37
+; GFX9-O0-NEXT: s_mov_b32 s44, s36
+; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47
+; GFX9-O0-NEXT: s_mov_b32 s45, s43
+; GFX9-O0-NEXT: s_mov_b32 s46, s42
+; GFX9-O0-NEXT: s_mov_b32 s47, s35
+; GFX9-O0-NEXT: v_writelane_b32 v0, s44, 2
+; GFX9-O0-NEXT: v_writelane_b32 v0, s45, 3
+; GFX9-O0-NEXT: v_writelane_b32 v0, s46, 4
+; GFX9-O0-NEXT: v_writelane_b32 v0, s47, 5
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b32 s35, s9
; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36
; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT: s_mov_b64 exec, s[38:39]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: s_mov_b32 s34, 32
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
@@ -634,20 +614,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4
-; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5
-; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0
-; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1
-; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2
-; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3
+; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 0
+; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 1
+; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2
+; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3
+; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4
+; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
@@ -679,14 +659,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000
-; GFX9-O0-NEXT: s_mov_b32 s33, s48
+; GFX9-O0-NEXT: s_mov_b32 s33, s50
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-O3-LABEL: strict_wwm_call_i64:
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT: s_mov_b32 s40, s33
+; GFX9-O3-NEXT: s_mov_b32 s38, s33
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -702,28 +682,26 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0x800
; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: s_getpc_b64 s[36:37]
-; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4
-; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called_i64@gotpcrel32@hi+12
-; GFX9-O3-NEXT: s_load_dwordx2 s[36:37], s[36:37], 0x0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
+; GFX9-O3-NEXT: s_getpc_b64 s[34:35]
+; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
+; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
+; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[38:39], -1
+; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-O3-NEXT: s_mov_b64 exec, s[38:39]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
@@ -739,7 +717,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-O3-NEXT: s_mov_b32 s33, s40
+; GFX9-O3-NEXT: s_mov_b32 s33, s38
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
%tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
@@ -778,16 +756,18 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s40, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT: s_mov_b32 s41, s35
+; GFX9-O0-NEXT: s_mov_b32 s42, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43
+; GFX9-O0-NEXT: s_mov_b32 s43, s35
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
@@ -796,21 +776,25 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
@@ -851,28 +835,30 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen
; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
-; GFX9-O3-NEXT: s_mov_b32 s34, -1
-; GFX9-O3-NEXT: s_brev_b32 s35, -2
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: s_mov_b32 s36, -1
+; GFX9-O3-NEXT: s_brev_b32 s37, -2
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, s36
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, s37
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s34
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s35
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, s36
+; GFX9-O3-NEXT: v_mov_b32_e32 v4, s37
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s34
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s35
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v5, s36
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, s37
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s34
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s35
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
@@ -922,21 +908,9 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -987,130 +961,110 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v42, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, s5
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v46, s25
-; GFX9-O0-NEXT: v_mov_b32_e32 v45, s26
-; GFX9-O0-NEXT: v_mov_b32_e32 v44, s27
-; GFX9-O0-NEXT: v_mov_b32_e32 v43, s28
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29
+; GFX9-O0-NEXT: v_mov_b32_e32 v39, s17
+; GFX9-O0-NEXT: v_mov_b32_e32 v38, s18
+; GFX9-O0-NEXT: v_mov_b32_e32 v37, s19
+; GFX9-O0-NEXT: v_mov_b32_e32 v36, s20
+; GFX9-O0-NEXT: v_mov_b32_e32 v35, s21
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v46, s23
+; GFX9-O0-NEXT: v_mov_b32_e32 v45, s24
+; GFX9-O0-NEXT: v_mov_b32_e32 v44, s25
+; GFX9-O0-NEXT: v_mov_b32_e32 v43, s26
+; GFX9-O0-NEXT: v_mov_b32_e32 v42, s27
+; GFX9-O0-NEXT: v_mov_b32_e32 v41, s28
+; GFX9-O0-NEXT: v_mov_b32_e32 v40, s29
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v46
-; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v45
-; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v44
-; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43
-; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v39
+; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v38
+; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v37
+; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v36
+; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(5)
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v46
+; GFX9-O0-NEXT: v_mov_b32_e32 v20, v45
+; GFX9-O0-NEXT: v_mov_b32_e32 v21, v44
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v42
+; GFX9-O0-NEXT: v_mov_b32_e32 v24, v41
+; GFX9-O0-NEXT: v_mov_b32_e32 v25, v40
+; GFX9-O0-NEXT: s_waitcnt vmcnt(5)
+; GFX9-O0-NEXT: v_mov_b32_e32 v26, v39
; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
-; GFX9-O0-NEXT: v_mov_b32_e32 v25, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v26, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v27, v46
-; GFX9-O0-NEXT: v_mov_b32_e32 v28, v45
-; GFX9-O0-NEXT: v_mov_b32_e32 v29, v44
-; GFX9-O0-NEXT: v_mov_b32_e32 v30, v43
-; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr42 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v27, v38
+; GFX9-O0-NEXT: s_waitcnt vmcnt(3)
+; GFX9-O0-NEXT: v_mov_b32_e32 v28, v37
+; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT: v_mov_b32_e32 v29, v36
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: v_mov_b32_e32 v30, v35
+; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr34 killed $exec
; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -1150,62 +1104,82 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
-; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10
; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v34, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v35, v9
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v34, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v35, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v36, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v37, v7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v36, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v37, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v38, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v39, v5
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v38, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v39, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v40, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v41, v3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v40, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v41, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v33
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v33
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, v9
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v33
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, v7
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v33
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, v5
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v33
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, v3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v35
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v34
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v37
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v36
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v39
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v41
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5
@@ -1245,16 +1219,8 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -1265,73 +1231,56 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_load_dword v26, off, s[0:3], s32
; GFX9-O3-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4
; GFX9-O3-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:8
; GFX9-O3-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12
; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:16
; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX9-O3-NEXT: v_mov_b32_e32 v32, v1
-; GFX9-O3-NEXT: v_mov_b32_e32 v33, v2
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v32, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v33, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v34, v3
-; GFX9-O3-NEXT: v_mov_b32_e32 v35, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v34, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v35, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v36, v5
-; GFX9-O3-NEXT: v_mov_b32_e32 v37, v6
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v36, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v37, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v38, v7
-; GFX9-O3-NEXT: v_mov_b32_e32 v39, v8
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v38, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v39, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v40, v9
-; GFX9-O3-NEXT: v_mov_b32_e32 v41, v10
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v40, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v41, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:4
-; GFX9-O3-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen
-; GFX9-O3-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:12
-; GFX9-O3-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:8
-; GFX9-O3-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:20
-; GFX9-O3-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:16
-; GFX9-O3-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:28
-; GFX9-O3-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:24
-; GFX9-O3-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:36
-; GFX9-O3-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:32
-; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: s_nop 0
-; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v2, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, v32
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, v33
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v4, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, v32
+; GFX9-O3-NEXT: v_mov_b32_e32 v4, v33
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v5, v32
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, v33
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v8, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, v32
+; GFX9-O3-NEXT: v_mov_b32_e32 v8, v33
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v10, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v9, v32
+; GFX9-O3-NEXT: v_mov_b32_e32 v10, v33
+; GFX9-O3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX9-O3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-O3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; GFX9-O3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; GFX9-O3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; GFX9-O3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28
+; GFX9-O3-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; GFX9-O3-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36
+; GFX9-O3-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32
; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
; GFX9-O3-NEXT: v_mov_b32_e32 v1, s5
; GFX9-O3-NEXT: v_mov_b32_e32 v2, s6
@@ -1359,24 +1308,21 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O3-NEXT: v_mov_b32_e32 v24, s28
; GFX9-O3-NEXT: v_mov_b32_e32 v25, s29
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
- %a2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0)
- %b2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0)
- %c2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %c, i64 0)
- %d2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %d, i64 0)
- %e2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %e, i64 0)
+ %a2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0)
+ %a2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %a2.i)
+ %b2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0)
+ %b2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %b2.i)
+ %c2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %c, i64 0)
+ %c2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %c2.i)
+ %d2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %d, i64 0)
+ %d2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %d2.i)
+ %e2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %e, i64 0)
+ %e2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %e2.i)
store i64 %a2, ptr addrspace(5) %ptr
%ptr_b = getelementptr i64, ptr addrspace(5) %ptr, i32 1
store i64 %b2, ptr addrspace(5) %ptr_b
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index a74dbe1de0d39e..7f0db3e362b308 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -26,15 +26,15 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -63,17 +63,10 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5]
+; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
@@ -154,11 +147,11 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -185,12 +178,8 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -236,34 +225,25 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: .LBB1_2: ; %merge
+; GFX9-O3-NEXT: ; %bb.2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -361,35 +341,35 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4
+; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s3, s7
-; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
-; GFX9-O0-NEXT: s_mov_b32 s7, s9
-; GFX9-O0-NEXT: s_mov_b32 s16, s8
+; GFX9-O0-NEXT: s_mov_b32 s3, s9
+; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX9-O0-NEXT: s_mov_b32 s9, s17
+; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s7
-; GFX9-O0-NEXT: s_mov_b32 s18, s6
+; GFX9-O0-NEXT: s_mov_b32 s17, s9
+; GFX9-O0-NEXT: s_mov_b32 s18, s8
; GFX9-O0-NEXT: s_mov_b32 s19, s3
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
; GFX9-O0-NEXT: s_mov_b32 s3, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9
; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -418,13 +398,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4
-; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5
-; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6
-; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7
-; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9
-; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10
-; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8
+; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6
+; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7
+; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8
+; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9
+; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4
+; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5
+; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -454,12 +434,12 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 56
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
@@ -613,35 +593,35 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4
+; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s6, s9
-; GFX9-O0-NEXT: s_mov_b32 s7, s8
-; GFX9-O0-NEXT: s_mov_b32 s8, s17
+; GFX9-O0-NEXT: s_mov_b32 s6, s19
+; GFX9-O0-NEXT: s_mov_b32 s7, s18
+; GFX9-O0-NEXT: s_mov_b32 s15, s17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s8
+; GFX9-O0-NEXT: s_mov_b32 s17, s15
; GFX9-O0-NEXT: s_mov_b32 s18, s7
; GFX9-O0-NEXT: s_mov_b32 s19, s6
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7
-; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9
+; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
+; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -678,12 +658,12 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4
-; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5
-; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6
-; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7
-; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8
-; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9
+; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6
+; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7
+; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8
+; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9
+; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4
+; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -721,14 +701,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 60
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: s_getpc_b64 s[2:3]
@@ -792,16 +772,18 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s6, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
-; GFX9-O0-NEXT: s_mov_b32 s7, s5
+; GFX9-O0-NEXT: s_mov_b32 s8, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
+; GFX9-O0-NEXT: s_mov_b32 s9, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
@@ -810,21 +792,25 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
@@ -848,28 +834,30 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
-; GFX9-O3-NEXT: s_mov_b32 s4, -1
-; GFX9-O3-NEXT: s_brev_b32 s5, -2
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: s_mov_b32 s6, -1
+; GFX9-O3-NEXT: s_brev_b32 s7, -2
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
@@ -927,15 +915,15 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -964,17 +952,10 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5]
+; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
@@ -1055,11 +1036,11 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1086,12 +1067,8 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -1137,34 +1114,25 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-O3-NEXT: s_cbranch_execz .LBB8_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: .LBB8_2: ; %merge
+; GFX9-O3-NEXT: ; %bb.2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1262,35 +1230,35 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4
+; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s3, s7
-; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
-; GFX9-O0-NEXT: s_mov_b32 s7, s9
-; GFX9-O0-NEXT: s_mov_b32 s16, s8
+; GFX9-O0-NEXT: s_mov_b32 s3, s9
+; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX9-O0-NEXT: s_mov_b32 s9, s17
+; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s7
-; GFX9-O0-NEXT: s_mov_b32 s18, s6
+; GFX9-O0-NEXT: s_mov_b32 s17, s9
+; GFX9-O0-NEXT: s_mov_b32 s18, s8
; GFX9-O0-NEXT: s_mov_b32 s19, s3
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
; GFX9-O0-NEXT: s_mov_b32 s3, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9
; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -1319,13 +1287,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4
-; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5
-; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6
-; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7
-; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9
-; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10
-; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8
+; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6
+; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7
+; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8
+; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9
+; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4
+; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5
+; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1355,12 +1323,12 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 56
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
@@ -1514,35 +1482,35 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4
+; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s6, s9
-; GFX9-O0-NEXT: s_mov_b32 s7, s8
-; GFX9-O0-NEXT: s_mov_b32 s8, s17
+; GFX9-O0-NEXT: s_mov_b32 s6, s19
+; GFX9-O0-NEXT: s_mov_b32 s7, s18
+; GFX9-O0-NEXT: s_mov_b32 s15, s17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s8
+; GFX9-O0-NEXT: s_mov_b32 s17, s15
; GFX9-O0-NEXT: s_mov_b32 s18, s7
; GFX9-O0-NEXT: s_mov_b32 s19, s6
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7
-; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9
+; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
+; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -1579,12 +1547,12 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4
-; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5
-; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6
-; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7
-; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8
-; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9
+; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6
+; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7
+; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8
+; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9
+; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4
+; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1622,14 +1590,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 60
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: s_getpc_b64 s[2:3]
@@ -1693,16 +1661,18 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s6, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
-; GFX9-O0-NEXT: s_mov_b32 s7, s5
+; GFX9-O0-NEXT: s_mov_b32 s8, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
+; GFX9-O0-NEXT: s_mov_b32 s9, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
@@ -1711,21 +1681,25 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
@@ -1749,28 +1723,30 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
-; GFX9-O3-NEXT: s_mov_b32 s4, -1
-; GFX9-O3-NEXT: s_brev_b32 s5, -2
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: s_mov_b32 s6, -1
+; GFX9-O3-NEXT: s_brev_b32 s7, -2
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index b3ed7376a1ede6..f73489b7db77cf 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -273,12 +273,15 @@ define amdgpu_cs void @wwm_reserved_regs(ptr addrspace(1) %ptr, <4 x i32> inreg
%ld1 = load volatile i32, ptr addrspace(1) %ptr
%inactive0 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld1, i32 0)
%inactive1 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld0, i32 0)
- store volatile i32 %inactive0, ptr addrspace(1) %ptr
- store volatile i32 %inactive1, ptr addrspace(1) %ptr
+ %wwm0 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive0)
+ %wwm1 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive1)
+ store volatile i32 %wwm0, ptr addrspace(1) %ptr
+ store volatile i32 %wwm1, ptr addrspace(1) %ptr
ret void
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #6
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #6
attributes #0 = { "no-signed-zeros-fp-math" = "true" }
attributes #1 = { "amdgpu-dx10-clamp" = "false" }
>From dfbf21c11c979faa529e1a23f042de4a10a2b7fd Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 29 Aug 2024 14:05:23 +0900
Subject: [PATCH 2/2] - Address reviewer comments
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6485b188e8422a..7129ef94251d24 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2280,7 +2280,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
? AMDGPU::V_MOV_B64_PSEUDO
: AMDGPU::V_MOV_B32_e32;
- Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ Register ExecReg = RI.getExec();
Register DstReg = MI.getOperand(0).getReg();
MachineOperand &ActiveSrc = MI.getOperand(1);
MachineOperand &InactiveSrc = MI.getOperand(2);
@@ -2307,7 +2307,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// present an issue.
// Fallback to V_MOV base lowering in all but the common cases.
const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MBB.getParent();
const MachineRegisterInfo &MRI = MF->getRegInfo();
const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
const MCInstrDesc &Desc = get(Opcode);
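
For readers following the GFX9-O3 test updates above, the lowering they exercise can be sketched roughly as follows. This is a minimal illustration, not the exact patch code: the names DstReg, ActiveSrc, InactiveSrc and SavedExec are placeholders, and it assumes the original exec mask has already been copied to an SGPR pair by s_or_saveexec before exec is set to -1, as in the emitted assembly.

    // Sketch only: while running whole-wave (exec = -1), select the
    // active-lane value where the saved exec mask bit is set and the
    // inactive-lane value where it is clear.  V_CNDMASK_B32_e64 takes
    // (src0 modifiers, src0, src1 modifiers, src1, select mask) after
    // the destination operand.
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)            // src0 modifiers
        .add(InactiveSrc)     // lanes where SavedExec bit == 0
        .addImm(0)            // src1 modifiers
        .add(ActiveSrc)       // lanes where SavedExec bit == 1
        .addReg(SavedExec);   // exec mask saved by s_or_saveexec
    MI.eraseFromParent();

This matches the checked output such as "v_cndmask_b32_e64 v2, 0, v4, s[4:5]", where 0 is the inactive-lane value, v4 the active-lane value, and s[4:5] the saved exec mask.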