[llvm] [AMDGPU] V_SET_INACTIVE optimizations (PR #98864)
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 4 19:32:42 PDT 2024
https://github.com/perlfu updated https://github.com/llvm/llvm-project/pull/98864
>From 717735a277840f3291d61c99c6c0185d98f122ed Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Sun, 14 Jul 2024 17:31:13 +0900
Subject: [PATCH 1/5] [AMDGPU] V_SET_INACTIVE optimizations
Optimize V_SET_INACTIVE by always running it in WWM.
Allows WWM sections to be unbroken, and facilitates V_SET_INACTIVE
to be lowered to V_CNDMASK in most cases.
Some cases still require exec manipulation and V_MOV, as in the previous code.
GFX9 sees slight instruction count increase in edge cases due to
smaller constant bus.
Additionally:
- Avoid introducing exec manipulation and V_MOVs where
a source of V_SET_INACTIVE is the destination.
- Lower any V_SET_INACTIVE not touched by marking to COPY.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 183 +-
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 68 +-
.../GlobalISel/llvm.amdgcn.set.inactive.ll | 401 ++--
.../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll | 30 +-
.../AMDGPU/amdgpu-cs-chain-preserve-cc.ll | 29 +-
.../atomic_optimizations_global_pointer.ll | 536 ++---
.../atomic_optimizations_local_pointer.ll | 1913 ++++++-----------
.../atomic_optimizations_pixelshader.ll | 64 +-
llvm/test/CodeGen/AMDGPU/cse-convergent.ll | 14 +-
llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll | 18 +-
.../AMDGPU/global_atomics_scan_fadd.ll | 607 ++----
.../AMDGPU/global_atomics_scan_fmax.ll | 422 ++--
.../AMDGPU/global_atomics_scan_fmin.ll | 422 ++--
.../AMDGPU/global_atomics_scan_fsub.ll | 607 ++----
.../llvm.amdgcn.set.inactive.chain.arg.ll | 389 ++--
.../AMDGPU/llvm.amdgcn.set.inactive.ll | 382 ++--
.../AMDGPU/set-inactive-wwm-overwrite.ll | 12 +-
.../AMDGPU/should-not-hoist-set-inactive.ll | 5 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 64 +-
llvm/test/CodeGen/AMDGPU/wqm.ll | 52 +-
llvm/test/CodeGen/AMDGPU/wqm.mir | 4 +-
.../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 704 +++---
llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 504 +++--
.../MIR/AMDGPU/machine-function-info.ll | 7 +-
24 files changed, 3167 insertions(+), 4270 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a857bdba53c3e8..6485b188e8422a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2273,37 +2273,162 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_SET_INACTIVE_B32: {
- unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
- // optimizations (mainly Register Coalescer) aware of WWM register liveness.
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
- MI.eraseFromParent();
- break;
- }
+ case AMDGPU::V_SET_INACTIVE_B32:
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- expandPostRAPseudo(*Copy);
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- expandPostRAPseudo(*Copy);
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+ ? AMDGPU::V_MOV_B64_PSEUDO
+ : AMDGPU::V_MOV_B32_e32;
+ Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ Register DstReg = MI.getOperand(0).getReg();
+ MachineOperand &ActiveSrc = MI.getOperand(1);
+ MachineOperand &InactiveSrc = MI.getOperand(2);
+
+ // Find implicit register defining lanes active outside WWM.
+ // Note: default here is set to ExecReg so that functional MIR is still
+ // generated if implicit def is not found and assertions are disabled.
+ Register ExecSrcReg = ExecReg;
+ for (auto &Op : MI.implicit_operands()) {
+ if (Op.isDef() || !Op.isReg())
+ continue;
+ Register OpReg = Op.getReg();
+ if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+ OpReg == AMDGPU::SCC)
+ continue;
+ ExecSrcReg = OpReg;
+ break;
+ }
+ assert(ExecSrcReg != ExecReg &&
+ "V_SET_INACTIVE must be in known WWM region");
+
+ // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+ // constant bus constraints and the presence of literal constants
+ // present an issue.
+ // Fallback to V_MOV base lowering in all but the common cases.
+ const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+ const MCInstrDesc &Desc = get(Opcode);
+
+ const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+ const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+ const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+ const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+ const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+ const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+ int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+ int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+ int ConstantBusUses =
+ 1 + // Starts at 1 for ExecSrcReg
+ (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+ (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+ int LiteralConstants =
+ (ActiveSrc.isImm() && !isInlineConstant(ActiveImm) ? 1 : 0) +
+ (InactiveSrc.isImm() && !isInlineConstant(InactiveImm) ? 1 : 0);
+
+ bool UseVCndMask =
+ ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+ if (VMov64 && UseVCndMask) {
+ // Decomposition must not introduce new literals.
+ UseVCndMask &=
+ ActiveSrc.isReg() ||
+          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
+ (!isInlineConstant(ActiveImm));
+ UseVCndMask &= InactiveSrc.isReg() ||
+ (isInlineConstant(InactiveImmLo) &&
+                      isInlineConstant(InactiveImmHi)) ||
+ (!isInlineConstant(InactiveImm));
+ }
+
+ if (UseVCndMask && VMov64) {
+ // Dual V_CNDMASK_B32
+ MachineOperand ActiveLo =
+ ActiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0), false,
+ /*isImp=*/false, /*isKill*/ false)
+ : MachineOperand::CreateImm(ActiveImmLo.getSExtValue());
+ MachineOperand ActiveHi =
+ ActiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1), false,
+ /*isImp=*/false, /*isKill*/ ActiveSrc.isKill())
+ : MachineOperand::CreateImm(ActiveImmHi.getSExtValue());
+ MachineOperand InactiveLo =
+ InactiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0), false,
+ /*isImp=*/false, /*isKill*/ false)
+ : MachineOperand::CreateImm(InactiveImmLo.getSExtValue());
+ MachineOperand InactiveHi =
+ InactiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1), false,
+ /*isImp=*/false, /*isKill*/ InactiveSrc.isKill())
+ : MachineOperand::CreateImm(InactiveImmHi.getSExtValue());
+ BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub0))
+ .addImm(0)
+ .add(InactiveLo)
+ .addImm(0)
+ .add(ActiveLo)
+ .addReg(ExecSrcReg)
+ .addReg(DstReg, RegState::ImplicitDefine);
+ BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub1))
+ .addImm(0)
+ .add(InactiveHi)
+ .addImm(0)
+ .add(ActiveHi)
+ .addReg(ExecSrcReg)
+ .addReg(DstReg, RegState::ImplicitDefine);
+ } else if (UseVCndMask) {
+ // Single V_CNDMASK_B32
+ BuildMI(MBB, MI, DL, get(Opcode), DstReg)
+ .addImm(0)
+ .add(InactiveSrc)
+ .addImm(0)
+ .add(ActiveSrc)
+ .addReg(ExecSrcReg);
+ } else {
+ // Fallback V_MOV case.
+ // Avoid unnecessary work if a source VGPR is also the destination.
+ // This can happen if WWM register allocation was efficient.
+ // Note: this assumes WWM execution.
+ bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+ bool DstIsInactive =
+ InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+ if (!DstIsInactive) {
+ // Set exec mask to inactive lanes,
+ // but only if active lanes would be overwritten.
+ if (DstIsActive) {
+ MachineInstr *ExecMI =
+ BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecSrcReg);
+ ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+ }
+ // Copy inactive lanes
+ MachineInstr *VMov =
+ BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
+ if (VMov64)
+ expandPostRAPseudo(*VMov);
+ }
+ if (!DstIsActive) {
+ // Set exec mask to active lanes
+ BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
+ // Copy active lanes
+ MachineInstr *VMov =
+ BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
+ .add(ActiveSrc);
+ if (VMov64)
+ expandPostRAPseudo(*VMov);
+ }
+ // Restore WWM
+ BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
+ }
MI.eraseFromParent();
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 9a51cbbb9f6b8e..fe2b40db1d4ea8 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -128,6 +128,7 @@ struct InstrInfo {
char Needs = 0;
char Disabled = 0;
char OutNeeds = 0;
+ char MarkedStates = 0;
};
struct BlockInfo {
@@ -175,9 +176,10 @@ class SIWholeQuadMode : public MachineFunctionPass {
SmallVector<MachineInstr *, 2> LiveMaskQueries;
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
- SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
+ SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
SmallVector<MachineInstr *, 4> KillInstrs;
SmallVector<MachineInstr *, 4> InitExecInstrs;
+ SmallVector<MachineInstr *, 4> SetInactiveInstrs;
void printInfo();
@@ -295,6 +297,9 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
assert(!(Flag & StateExact) && Flag != 0);
+ // Capture all states requested in marking including disabled ones.
+ II.MarkedStates |= Flag;
+
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
// the result of an atomic is used by instruction that requires WQM, where
@@ -478,7 +483,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
- SmallVector<MachineInstr *, 4> SetInactiveInstrs;
SmallVector<MachineInstr *, 4> SoftWQMInstrs;
bool HasImplicitDerivatives =
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
@@ -512,9 +516,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// The WQM intrinsic requires its output to have all the helper lanes
// correct, so we need it to be in WQM.
Flags = StateWQM;
- LowerToCopyInstrs.push_back(&MI);
+ LowerToCopyInstrs.insert(&MI);
} else if (Opcode == AMDGPU::SOFT_WQM) {
- LowerToCopyInstrs.push_back(&MI);
+ LowerToCopyInstrs.insert(&MI);
SoftWQMInstrs.push_back(&MI);
} else if (Opcode == AMDGPU::STRICT_WWM) {
// The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
@@ -555,16 +559,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
GlobalFlags |= StateStrictWQM;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+ // Disable strict states; StrictWQM will be added as required later.
III.Disabled = StateStrict;
MachineOperand &Inactive = MI.getOperand(2);
if (Inactive.isReg()) {
if (Inactive.isUndef()) {
- LowerToCopyInstrs.push_back(&MI);
+ LowerToCopyInstrs.insert(&MI);
} else {
markOperand(MI, Inactive, StateStrictWWM, Worklist);
}
}
SetInactiveInstrs.push_back(&MI);
+ BBI.NeedsLowering = true;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
@@ -1042,6 +1048,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
SmallVector<MachineInstr *, 4> SplitPoints;
+ Register ActiveLanesReg = 0;
char State = BI.InitialState;
for (MachineInstr &MI : llvm::make_early_inc_range(
@@ -1058,6 +1065,20 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MBB, MI);
break;
+ case AMDGPU::ENTER_STRICT_WWM:
+ ActiveLanesReg = MI.getOperand(0).getReg();
+ break;
+ case AMDGPU::EXIT_STRICT_WWM:
+ ActiveLanesReg = 0;
+ break;
+ case AMDGPU::V_SET_INACTIVE_B32:
+ case AMDGPU::V_SET_INACTIVE_B64:
+ if (ActiveLanesReg) {
+ MI.addOperand(*MBB.getParent(),
+ MachineOperand::CreateReg(ActiveLanesReg, false, true));
+ } else
+ assert(State == StateExact || State == StateWQM);
+ break;
default:
break;
}
@@ -1497,13 +1518,14 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
}
}
for (MachineInstr *MI : LowerToCopyInstrs) {
+ LLVM_DEBUG(dbgs() << "simplify: " << *MI);
+
+ Register RecomputeReg = 0;
if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
assert(MI->getNumExplicitOperands() == 3);
- // the only reason we should be here is V_SET_INACTIVE has
- // an undef input so it is being replaced by a simple copy.
- // There should be a second undef source that we should remove.
- assert(MI->getOperand(2).isUndef());
+ if (MI->getOperand(2).isReg())
+ RecomputeReg = MI->getOperand(2).getReg();
MI->removeOperand(2);
MI->untieRegOperand(1);
} else {
@@ -1514,7 +1536,19 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
? (unsigned)AMDGPU::COPY
: TII->getMovOpcode(TRI->getRegClassForOperandReg(
*MRI, MI->getOperand(0)));
+ int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
+ while (Index >= 0) {
+ MI->removeOperand(Index);
+ Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
+ }
+
MI->setDesc(TII->get(CopyOp));
+ LLVM_DEBUG(dbgs() << " -> " << *MI);
+
+ if (RecomputeReg) {
+ LIS->removeInterval(RecomputeReg);
+ LIS->createAndComputeVirtRegInterval(RecomputeReg);
+ }
}
return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}
@@ -1656,6 +1690,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
LowerToMovInstrs.clear();
KillInstrs.clear();
InitExecInstrs.clear();
+ SetInactiveInstrs.clear();
StateTransition.clear();
ST = &MF.getSubtarget<GCNSubtarget>();
@@ -1712,6 +1747,21 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
Changed = true;
}
+ // Check if V_SET_INACTIVE was touched by a strict state mode.
+ // If so, promote to WWM; otherwise lower to COPY.
+ for (MachineInstr *MI : SetInactiveInstrs) {
+ if (LowerToCopyInstrs.contains(MI))
+ continue;
+ if (Instructions[MI].MarkedStates & StateStrict) {
+ Instructions[MI].Needs |= StateStrictWWM;
+ Instructions[MI].Disabled &= ~StateStrictWWM;
+ Blocks[MI->getParent()].Needs |= StateStrictWWM;
+ } else {
+ LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
+ LowerToCopyInstrs.insert(MI);
+ }
+ }
+
LLVM_DEBUG(printInfo());
Changed |= lowerLiveMaskQueries();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 8f88aaedf7e95e..137366a45cbdfc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -4,18 +4,39 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
+ store i32 %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
+; GCN-LABEL: set_inactive_imm_poison:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: v_mov_b32_e32 v0, v0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
store i32 %tmp, ptr addrspace(1) %out
ret void
}
@@ -24,18 +45,42 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+ %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+ %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
+ store i64 %tmp, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
+; GCN-LABEL: set_inactive_imm_poison_64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, v1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+ %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
+ %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
store i64 %tmp, ptr addrspace(1) %out
ret void
}
@@ -45,39 +90,43 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0
-; GCN-NEXT: s_load_dword s5, s[2:3], 0x2c
+; GCN-NEXT: s_buffer_load_dword s6, s[4:7], 0x0
+; GCN-NEXT: s_load_dword s7, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s4, 56
+; GCN-NEXT: s_cmp_lg_u32 s6, 56
; GCN-NEXT: s_cselect_b32 s3, 1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, 1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_cmp_lg_u32 s3, 0
-; GCN-NEXT: s_cbranch_scc0 .LBB2_2
+; GCN-NEXT: s_cbranch_scc0 .LBB4_2
; GCN-NEXT: ; %bb.1: ; %.one
-; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0
; GCN-NEXT: s_mov_b32 s2, 0
-; GCN-NEXT: .LBB2_2: ; %Flow
+; GCN-NEXT: .LBB4_2: ; %Flow
; GCN-NEXT: s_xor_b32 s2, s2, 1
; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cbranch_scc1 .LBB2_4
+; GCN-NEXT: s_cbranch_scc1 .LBB4_4
; GCN-NEXT: ; %bb.3: ; %.zero
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GCN-NEXT: .LBB2_4: ; %.exit
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: .LBB4_4: ; %.exit
; GCN-NEXT: s_endpgm
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
%cmp = icmp eq i32 %val, 56
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
br i1 %cmp, label %.zero, label %.one
.zero:
@@ -96,19 +145,22 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0)
store float %tmp, ptr addrspace(1) %out
ret void
}
@@ -117,20 +169,23 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v2, 0xcccccccd
-; GCN-NEXT: v_mov_b32_e32 v3, 0x4010cccc
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0xcccccccd
+; GCN-NEXT: v_mov_b32_e32 v1, 0x4010cccc
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0)
store double %tmp, ptr addrspace(1) %out
ret void
}
@@ -138,19 +193,22 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0)
store <2 x i16> %tmp, ptr addrspace(1) %out
ret void
}
@@ -158,19 +216,22 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2i16(<2 x half> %tmp.0)
store <2 x half> %tmp, ptr addrspace(1) %out
ret void
}
@@ -179,22 +240,25 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 1
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 1
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0)
store <2 x i32> %tmp, ptr addrspace(1) %out
ret void
}
@@ -203,22 +267,25 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 1.0
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 1.0
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0)
store <2 x float> %tmp, ptr addrspace(1) %out
ret void
}
@@ -226,19 +293,22 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v1
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+ %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+ %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0)
store <2 x bfloat> %tmp, ptr addrspace(1) %out
ret void
}
@@ -247,22 +317,25 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
; GCN-LABEL: set_inactive_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0x10001
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 0x10001
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+ %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+ %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
store <4 x i16> %tmp, ptr addrspace(1) %out
ret void
}
@@ -271,22 +344,25 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
; GCN-LABEL: set_inactive_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+ %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+ %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
store <4 x half> %tmp, ptr addrspace(1) %out
ret void
}
@@ -295,22 +371,25 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
; GCN-LABEL: set_inactive_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0x3f803f80
-; GCN-NEXT: s_mov_b32 s5, s4
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s6, 0x3f803f80
+; GCN-NEXT: s_mov_b32 s7, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, v2
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+ %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+ %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
store <4 x bfloat> %tmp, ptr addrspace(1) %out
ret void
}
@@ -319,18 +398,23 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0)
store ptr %tmp, ptr addrspace(1) %out
ret void
}
@@ -338,18 +422,22 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0)
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
ret void
}
@@ -357,18 +445,22 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0)
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
ret void
}
@@ -376,18 +468,22 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0)
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
ret void
}
@@ -395,24 +491,31 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0)
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
ret void
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
+declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
attributes #0 = { convergent readnone }
+attributes #1 = { convergent nounwind readnone speculatable willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index c92b78cd45573a..e34ae52fc673ab 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -571,11 +571,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3
-; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4
-; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0
+; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, s0
; GISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1
; GISEL-GFX11-NEXT: ;;#ASMSTART
; GISEL-GFX11-NEXT: s_nop
@@ -591,10 +590,9 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
; GISEL-GFX10: ; %bb.0:
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3
-; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4
-; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL-GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0
+; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, s0
; GISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1
; GISEL-GFX10-NEXT: ;;#ASMSTART
; GISEL-GFX10-NEXT: s_nop
@@ -609,11 +607,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
; DAGISEL-GFX11-LABEL: chain_to_chain_wwm:
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX11-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3
-; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4
-; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4
+; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v2, v1
; DAGISEL-GFX11-NEXT: ;;#ASMSTART
@@ -629,11 +626,10 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
; DAGISEL-GFX10-LABEL: chain_to_chain_wwm:
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-GFX10-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3
-; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4
-; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4
+; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v2, v1
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index 8d9ed9bb4343c6..320268564f4dbe 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -329,10 +329,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
; GISEL-GFX11-NEXT: s_mov_b32 s3, s0
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3
-; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4
-; GISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL-GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0
+; GISEL-GFX11-NEXT: s_mov_b32 exec_lo, s0
; GISEL-GFX11-NEXT: ;;#ASMSTART
; GISEL-GFX11-NEXT: s_nop
; GISEL-GFX11-NEXT: ;;#ASMEND
@@ -351,10 +351,9 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
; GISEL-GFX10-NEXT: s_mov_b32 s3, s0
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3
-; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4
-; GISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL-GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s0
+; GISEL-GFX10-NEXT: s_mov_b32 exec_lo, s0
; GISEL-GFX10-NEXT: ;;#ASMSTART
; GISEL-GFX10-NEXT: s_nop
; GISEL-GFX10-NEXT: ;;#ASMEND
@@ -371,11 +370,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v16, off ; 4-byte Folded Spill
+; DAGISEL-GFX11-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL-GFX11-NEXT: s_mov_b32 s3, s0
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 3
-; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL-GFX11-NEXT: v_mov_b32_e32 v1, 4
-; DAGISEL-GFX11-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL-GFX11-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4
+; DAGISEL-GFX11-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL-GFX11-NEXT: ;;#ASMSTART
; DAGISEL-GFX11-NEXT: s_nop
; DAGISEL-GFX11-NEXT: ;;#ASMEND
@@ -393,11 +391,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
; DAGISEL-GFX10: ; %bb.0:
; DAGISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX10-NEXT: buffer_store_dword v16, off, s[48:51], 0 ; 4-byte Folded Spill
+; DAGISEL-GFX10-NEXT: s_or_saveexec_b32 s4, -1
; DAGISEL-GFX10-NEXT: s_mov_b32 s3, s0
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 3
-; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL-GFX10-NEXT: v_mov_b32_e32 v1, 4
-; DAGISEL-GFX10-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL-GFX10-NEXT: v_cndmask_b32_e64 v1, 4, 3, s4
+; DAGISEL-GFX10-NEXT: s_mov_b32 exec_lo, s4
; DAGISEL-GFX10-NEXT: ;;#ASMSTART
; DAGISEL-GFX10-NEXT: s_nop
; DAGISEL-GFX10-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index cc7050d08541a0..5a8df7b84bf2f3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1147,11 +1147,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1200,11 +1198,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1247,13 +1243,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1064_DPP-LABEL: add_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1310,11 +1303,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1032_DPP-LABEL: add_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1364,27 +1354,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-LABEL: add_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -1438,35 +1425,33 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-LABEL: add_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
@@ -1500,27 +1485,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-LABEL: add_i32_varying:
; GFX1264_DPP: ; %bb.0: ; %entry
; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -1578,33 +1560,30 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-LABEL: add_i32_varying:
; GFX1232_DPP: ; %bb.0: ; %entry
; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
@@ -2918,15 +2897,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3018,15 +2991,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3115,47 +3082,41 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
@@ -3228,40 +3189,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
@@ -3323,21 +3278,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
@@ -3441,22 +3390,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -3537,56 +3481,50 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
@@ -3658,35 +3596,29 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -4894,11 +4826,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4947,11 +4877,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4994,13 +4922,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1064_DPP-LABEL: sub_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5057,11 +4982,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1032_DPP-LABEL: sub_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5111,27 +5033,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-LABEL: sub_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 15
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -5185,35 +5104,33 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-LABEL: sub_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v1, 31
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
@@ -5247,27 +5164,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-LABEL: sub_i32_varying:
; GFX1264_DPP: ; %bb.0: ; %entry
; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_readlane_b32 s6, v1, 15
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
@@ -5325,33 +5239,30 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-LABEL: sub_i32_varying:
; GFX1232_DPP: ; %bb.0: ; %entry
; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1232_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_readlane_b32 s6, v1, 31
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
@@ -6707,15 +6618,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6807,15 +6712,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6904,47 +6803,41 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
@@ -7017,40 +6910,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
@@ -7112,21 +6999,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
@@ -7230,22 +7111,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7326,56 +7202,50 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1264_DPP-NEXT: s_not_b64 exec, exec
-; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1264_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1264_DPP-NEXT: v_readlane_b32 s5, v4, 31
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1264_DPP-NEXT: v_mov_b32_e32 v7, s5
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v7, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_readlane_b32 s4, v3, 31
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, s4
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1264_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
@@ -7447,35 +7317,29 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1232_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
+; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1232_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 6d0e0cc7869b3f..6bf03a202c1434 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -780,14 +780,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -827,14 +825,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -870,13 +866,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: add_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -925,13 +918,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: add_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -971,33 +961,30 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: add_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -1037,27 +1024,24 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: add_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
@@ -1315,11 +1299,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1350,11 +1332,9 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1381,11 +1361,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
;
; GFX1064_DPP-LABEL: add_i32_varying_nouse:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1414,11 +1391,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
;
; GFX1032_DPP-LABEL: add_i32_varying_nouse:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -1442,34 +1416,32 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_DPP-LABEL: add_i32_varying_nouse:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1164_DPP-NEXT: ; %bb.1:
@@ -1482,26 +1454,24 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_DPP-LABEL: add_i32_varying_nouse:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1132_DPP-NEXT: ; %bb.1:
@@ -2398,15 +2368,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2493,15 +2457,9 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2585,47 +2543,41 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
@@ -2689,43 +2641,37 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP: ; %bb.0: ; %entry
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
@@ -2734,10 +2680,10 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
@@ -2779,21 +2725,15 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
@@ -2891,23 +2831,18 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -3235,15 +3170,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3312,15 +3241,9 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3386,23 +3309,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3451,22 +3368,18 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3507,21 +3420,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v6, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v6
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v7
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
@@ -3575,22 +3482,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v6
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v7
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s0
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4400,14 +4302,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4447,14 +4347,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4490,13 +4388,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: sub_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4545,13 +4440,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: sub_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4591,33 +4483,30 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: sub_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -4657,27 +4546,24 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: sub_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
@@ -4935,11 +4821,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -4970,11 +4854,9 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5001,11 +4883,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
;
; GFX1064_DPP-LABEL: sub_i32_varying_nouse:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5034,11 +4913,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
;
; GFX1032_DPP-LABEL: sub_i32_varying_nouse:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -5062,34 +4938,32 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_DPP-LABEL: sub_i32_varying_nouse:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1164_DPP-NEXT: ; %bb.1:
@@ -5102,26 +4976,24 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_DPP-LABEL: sub_i32_varying_nouse:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB11_2
; GFX1132_DPP-NEXT: ; %bb.1:
@@ -6044,15 +5916,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6139,15 +6005,9 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -6231,47 +6091,41 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v8, vcc
; GFX1064_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1064_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v7, vcc
; GFX1064_DPP-NEXT: v_readlane_b32 s4, v3, 31
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v2
; GFX1064_DPP-NEXT: v_readlane_b32 s5, v4, 31
@@ -6335,43 +6189,37 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP: ; %bb.0: ; %entry
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v8, vcc_lo
; GFX1032_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
; GFX1032_DPP-NEXT: v_permlanex16_b32 v8, v4, -1, -1
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
@@ -6380,10 +6228,10 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v7, vcc_lo
; GFX1032_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v4, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v3, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
@@ -6425,21 +6273,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
@@ -6537,23 +6379,18 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -6943,13 +6780,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -6992,13 +6825,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7037,13 +6866,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: and_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7092,13 +6918,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: and_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7138,33 +6961,30 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: and_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -7204,31 +7024,29 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: and_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
@@ -7619,16 +7437,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
@@ -7683,16 +7495,10 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
@@ -7741,19 +7547,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: and_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1]
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -7822,19 +7624,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: and_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
@@ -7846,11 +7644,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
@@ -7885,47 +7683,43 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: and_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1]
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -7981,43 +7775,39 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: and_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
@@ -8375,14 +8165,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8422,14 +8210,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8465,13 +8251,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: or_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8520,13 +8303,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: or_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8566,33 +8346,30 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: or_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -8632,27 +8409,24 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: or_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
@@ -9047,16 +8821,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
@@ -9111,16 +8879,10 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
@@ -9169,19 +8931,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: or_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9250,19 +9008,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: or_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9274,11 +9028,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
@@ -9313,47 +9067,43 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: or_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -9409,43 +9159,39 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: or_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
@@ -9803,14 +9549,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9850,14 +9594,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9893,13 +9635,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: xor_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9948,13 +9687,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: xor_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9994,33 +9730,30 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: xor_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -10060,27 +9793,24 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: xor_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
@@ -10475,16 +10205,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
@@ -10539,16 +10263,10 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v6
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
@@ -10597,19 +10315,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: xor_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10678,19 +10392,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: xor_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10702,11 +10412,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
@@ -10741,47 +10451,43 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: xor_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
+; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -10837,43 +10543,39 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: xor_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
@@ -11232,12 +10934,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, -1
+; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -11281,12 +10982,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, -1
+; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -11325,13 +11025,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: max_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -11380,13 +11077,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: max_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0
; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -11426,33 +11120,30 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: max_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1]
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -11492,31 +11183,29 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: max_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0
; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
@@ -12195,19 +11884,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: max_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX8_DPP-NEXT: s_mov_b32 s0, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX8_DPP-NEXT: s_brev_b32 s1, 1
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX8_DPP-NEXT: s_mov_b64 exec, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
@@ -12294,19 +11983,19 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: max_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX9_DPP-NEXT: s_mov_b32 s0, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9_DPP-NEXT: s_brev_b32 s1, 1
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX9_DPP-NEXT: s_mov_b64 exec, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
@@ -12393,20 +12082,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-LABEL: max_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: s_mov_b32 s0, 0
-; GFX1064_DPP-NEXT: s_brev_b32 s1, 1
+; GFX1064_DPP-NEXT: s_mov_b32 s4, 0
+; GFX1064_DPP-NEXT: s_brev_b32 s5, 1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s5
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s4, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s5, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -12515,20 +12198,14 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s0, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s1, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
@@ -12610,77 +12287,70 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s[4:5]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s[4:5]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s1, v3, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12744,55 +12414,48 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
@@ -13158,12 +12821,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, -1
+; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -13207,12 +12869,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, -1
+; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -13251,13 +12912,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: min_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
; GFX1064_DPP-NEXT: v_bfrev_b32_e32 v3, -2
+; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -13306,13 +12964,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: min_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
; GFX1032_DPP-NEXT: v_bfrev_b32_e32 v3, -2
+; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -13352,33 +13007,30 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: min_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1]
; GFX1164_DPP-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -13418,31 +13070,29 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: min_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0
; GFX1132_DPP-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
@@ -14124,16 +13774,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: s_mov_b32 s6, -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_mov_b32 s6, -1
; GFX8_DPP-NEXT: s_brev_b32 s7, -2
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX8_DPP-NEXT: s_mov_b64 exec, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6
; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
@@ -14221,16 +13871,16 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: s_mov_b32 s6, -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_mov_b32 s6, -1
; GFX9_DPP-NEXT: s_brev_b32 s7, -2
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
+; GFX9_DPP-NEXT: s_mov_b64 exec, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6
; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
@@ -14321,14 +13971,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, s6
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, s7
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -14433,17 +14077,11 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_brev_b32 s7, -2
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, s6
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, s7
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -14530,77 +14168,70 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, s6
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, s7
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -14663,56 +14294,49 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, s6
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, s7
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -15076,14 +14700,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -15123,14 +14745,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -15166,13 +14786,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: umax_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -15221,13 +14838,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: umax_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -15267,33 +14881,30 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: umax_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -15333,27 +14944,24 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: umax_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
@@ -16033,14 +15641,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
@@ -16131,14 +15733,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
@@ -16227,13 +15823,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -16337,20 +15929,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP: ; %bb.0: ; %entry
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, 0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -16433,77 +16019,70 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -16564,56 +16143,49 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
@@ -16978,13 +16550,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: s_nop 0
; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -17027,13 +16595,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: s_nop 0
; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -17072,13 +16636,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: umin_i32_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -17127,13 +16688,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: umin_i32_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -17173,33 +16731,30 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-LABEL: umin_i32_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 47
@@ -17239,31 +16794,29 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-LABEL: umin_i32_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v1, -1, -1
; GFX1132_DPP-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
@@ -17939,14 +17492,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX8_DPP-NEXT: s_not_b64 exec, exec
-; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX8_DPP-NEXT: s_nop 0
@@ -18037,14 +17584,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, v8
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX9_DPP-NEXT: s_not_b64 exec, exec
-; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX9_DPP-NEXT: s_nop 0
@@ -18133,13 +17674,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX1064_DPP-NEXT: s_not_b64 exec, exec
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -18243,20 +17780,14 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP: ; %bb.0: ; %entry
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, v9
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v10
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v3, -1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, -1
-; GFX1032_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -18339,77 +17870,70 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1
-; GFX1164_DPP-NEXT: s_not_b64 exec, exec
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -18470,56 +17994,49 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v9
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v10
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1
-; GFX1132_DPP-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v7, v1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_mov_b32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_mov_b32 v7, v1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v4, v8, v4
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v4, v6, v4
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 22eb8d05b5ff23..429e6c489bf6f8 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -277,11 +277,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX8-NEXT: s_mov_b64 exec, s[10:11]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: s_not_b64 exec, exec
-; GFX8-NEXT: v_mov_b32_e32 v2, 0
-; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[10:11]
+; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -334,11 +332,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX9-NEXT: s_mov_b64 exec, s[10:11]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_not_b64 exec, exec
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[10:11]
+; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -386,13 +382,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT: s_cbranch_execz .LBB1_4
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1
-; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[10:11]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
+; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -449,13 +442,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1032-NEXT: s_and_saveexec_b32 s8, s9
; GFX1032-NEXT: s_cbranch_execz .LBB1_4
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s9, -1
-; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, v0, s9
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
+; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -503,32 +493,30 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1164-NEXT: s_cbranch_execz .LBB1_4
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_not_b64 exec, exec
-; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[10:11]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT: v_permlanex16_b32 v2, v1, -1, -1
-; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s12, v1, 31
-; GFX1164-NEXT: v_mov_b32_e32 v2, s12
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s12
; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s12, v1, 15
; GFX1164-NEXT: v_readlane_b32 s13, v1, 31
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: v_writelane_b32 v3, s12, 16
; GFX1164-NEXT: s_mov_b64 exec, s[10:11]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX1164-NEXT: v_readlane_b32 s12, v1, 63
@@ -577,31 +565,29 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1132-NEXT: s_and_saveexec_b32 s8, s9
; GFX1132-NEXT: s_cbranch_execz .LBB1_4
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s9, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, v0, s9
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT: v_permlanex16_b32 v2, v1, -1, -1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_readlane_b32 s11, v1, 31
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_readlane_b32 s10, v1, 15
; GFX1132-NEXT: s_mov_b32 exec_lo, s9
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s9, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_writelane_b32 v3, s10, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s9
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
index 0d74bd39b56fec..7aca63d34f51bf 100644
--- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
@@ -12,12 +12,7 @@ define i32 @test(i32 %val, i32 %cond) {
; GCN-NEXT: s_mov_b32 exec_lo, s4
; GCN-NEXT: s_or_saveexec_b32 s4, -1
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_mov_b32 exec_lo, s4
-; GCN-NEXT: v_mov_b32_e32 v3, v0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: s_or_saveexec_b32 s4, -1
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s4
; GCN-NEXT: v_mov_b32_e32 v5, 0
@@ -27,12 +22,7 @@ define i32 @test(i32 %val, i32 %cond) {
; GCN-NEXT: ; %bb.1: ; %if
; GCN-NEXT: s_or_saveexec_b32 s5, -1
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_mov_b32 exec_lo, s5
-; GCN-NEXT: v_mov_b32_e32 v3, v0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: s_or_saveexec_b32 s5, -1
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s5
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s5
; GCN-NEXT: v_mov_b32_e32 v5, v2
diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
index 82dc6d21cfe33d..310f32ce8f83bc 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll
@@ -6,16 +6,13 @@
define amdgpu_hs void @wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) {
; GCN-LABEL: wwm:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_mov_b32 s7, s4
; GCN-NEXT: s_mov_b32 s6, s3
; GCN-NEXT: s_mov_b32 s5, s2
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: s_mov_b32 s7, s4
; GCN-NEXT: s_mov_b32 s4, s1
; GCN-NEXT: s_mov_b32 s1, 1
-; GCN-NEXT: v_mov_b32_e32 v0, 4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 4, s[2:3]
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s0, 0
@@ -63,16 +60,13 @@ work:
define amdgpu_hs void @strict_wwm(i32 inreg %arg, ptr addrspace(8) inreg %buffer) {
; GCN-LABEL: strict_wwm:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_mov_b32 s7, s4
; GCN-NEXT: s_mov_b32 s6, s3
; GCN-NEXT: s_mov_b32 s5, s2
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: s_mov_b32 s7, s4
; GCN-NEXT: s_mov_b32 s4, s1
; GCN-NEXT: s_mov_b32 s1, 1
-; GCN-NEXT: v_mov_b32_e32 v0, 4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1, 4, s[2:3]
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 584b280cefb8a8..311c6092918863 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -816,12 +816,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -900,14 +898,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -981,14 +974,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1046,43 +1034,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -1113,15 +1096,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2049,12 +2027,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2133,14 +2109,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2214,14 +2185,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2279,43 +2245,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -2346,15 +2307,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3342,12 +3298,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3426,14 +3380,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3507,14 +3456,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3572,43 +3516,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -3639,15 +3578,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4131,12 +4065,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4215,14 +4147,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4296,14 +4223,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4361,43 +4283,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -4428,15 +4345,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5449,12 +5361,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5533,14 +5443,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5614,14 +5519,9 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5679,43 +5579,38 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -5759,15 +5654,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7442,14 +7332,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -7579,15 +7463,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7707,15 +7585,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7819,17 +7691,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -7864,11 +7731,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -7946,16 +7813,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -9047,14 +8909,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -9152,15 +9008,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9246,15 +9096,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9324,17 +9168,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9366,9 +9205,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -9419,16 +9259,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -10497,14 +10332,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -10602,15 +10431,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10696,15 +10519,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10774,17 +10591,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -10816,9 +10628,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -10869,16 +10682,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -11429,14 +11237,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -11534,15 +11336,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11628,15 +11424,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11706,17 +11496,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11748,9 +11533,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -11801,16 +11587,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -13526,14 +13307,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -13663,15 +13438,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13791,15 +13560,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13903,17 +13666,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -13948,11 +13706,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -14030,16 +13788,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 464ec088dc297b..9dc82b17bd3f4d 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -718,12 +718,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -811,15 +809,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -888,15 +881,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -947,46 +935,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -1023,42 +1006,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -1777,12 +1756,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1870,15 +1847,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1947,15 +1919,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2006,46 +1973,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -2082,42 +2044,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -2836,12 +2794,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2929,15 +2885,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3006,15 +2957,10 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3065,46 +3011,41 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -3141,42 +3082,38 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -4807,14 +4744,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -4953,15 +4884,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5091,15 +5016,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5211,17 +5130,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5266,11 +5180,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -5350,16 +5264,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -6281,14 +6190,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -6395,15 +6298,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6484,15 +6381,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6555,17 +6446,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -6607,9 +6493,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -6663,16 +6550,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -8358,14 +8240,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -8504,15 +8380,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8642,15 +8512,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8762,17 +8626,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8817,11 +8676,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -8901,16 +8760,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 26a0e34d18bdbd..945583c88ce267 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -718,12 +718,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -811,15 +809,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -888,15 +881,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -947,46 +935,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -1023,42 +1006,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -1777,12 +1756,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1870,15 +1847,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1947,15 +1919,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2006,46 +1973,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -2082,42 +2044,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -2836,12 +2794,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2929,15 +2885,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1064-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3006,15 +2957,10 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, v0, s0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
; GFX1032-DPP-NEXT: v_min_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3065,46 +3011,41 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
-; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
@@ -3141,42 +3082,38 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x7fc00000, v0, s0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7fc00000
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
-; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -4807,14 +4744,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -4953,15 +4884,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5091,15 +5016,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5211,17 +5130,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5266,11 +5180,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -5350,16 +5264,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -6281,14 +6190,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -6395,15 +6298,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6484,15 +6381,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6555,17 +6446,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -6607,9 +6493,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
; GFX1164-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -6663,16 +6550,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -8358,14 +8240,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -8504,15 +8380,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8642,15 +8512,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8762,17 +8626,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8817,11 +8676,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -8901,16 +8760,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index c158a8007bcc53..3bc0f2546794d1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -894,12 +894,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -978,14 +976,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1059,14 +1052,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -1124,43 +1112,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -1204,15 +1187,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2239,12 +2217,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2323,14 +2299,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2404,14 +2375,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -2469,43 +2435,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -2549,15 +2510,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3584,12 +3540,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3668,14 +3622,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3749,14 +3698,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -3814,43 +3758,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -3894,15 +3833,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4425,12 +4359,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4509,14 +4441,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4590,14 +4517,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -4655,43 +4577,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -4735,15 +4652,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5769,12 +5681,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: s_mov_b64 exec, -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5853,14 +5763,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s[0:1]
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5934,14 +5839,9 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, v0, s0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
+; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
@@ -5999,43 +5899,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s[0:1]
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v1, 0, 0
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -6079,15 +5974,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v2, 0x80000000, v0, s0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7762,14 +7652,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -7899,15 +7783,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8027,15 +7905,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8139,17 +8011,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -8184,11 +8051,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -8266,16 +8133,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -9366,14 +9228,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -9471,15 +9327,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9565,15 +9415,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9643,17 +9487,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9685,9 +9524,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -9738,16 +9578,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -10816,14 +10651,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -10921,15 +10750,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11015,15 +10838,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11093,17 +10910,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11135,9 +10947,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -11188,16 +11001,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -11748,14 +11556,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9-DPP-NEXT: s_nop 0
@@ -11853,15 +11655,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11947,15 +11743,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -12025,17 +11815,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -12067,9 +11852,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
@@ -12120,16 +11906,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -13844,14 +13625,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-DPP-NEXT: s_not_b64 exec, exec
-; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX9-DPP-NEXT: s_nop 0
@@ -13981,15 +13756,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1064-DPP-NEXT: s_not_b64 exec, exec
-; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -14109,15 +13878,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -14221,17 +13984,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1164-DPP-NEXT: s_not_b64 exec, exec
-; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -14266,11 +14024,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfffe
; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3
; GFX1164-DPP-NEXT: ; %bb.1:
@@ -14348,16 +14106,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
-; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index b3acd4949301e1..c1b58f1795aaec 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -12,97 +12,204 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
; GFX11-LABEL: set_inactive_chain_arg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v11
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: v_mov_b32_e32 v0, v10
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT: global_store_b32 v[8:9], v0, off
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: global_store_b32 v[8:9], v1, off
; GFX11-NEXT: s_endpgm
;
; GFX10-LABEL: set_inactive_chain_arg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v11
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
+; GFX10-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-NEXT: v_mov_b32_e32 v0, v10
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT: global_store_dword v[8:9], v0, off
+; GFX10-NEXT: s_mov_b32 exec_lo, s0
+; GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
+; GFX10-NEXT: s_mov_b32 exec_lo, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: global_store_dword v[8:9], v1, off
; GFX10-NEXT: s_endpgm
;
; GFX11_W64-LABEL: set_inactive_chain_arg:
; GFX11_W64: ; %bb.0:
; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT: v_mov_b32_e32 v0, v11
-; GFX11_W64-NEXT: s_not_b64 exec, exec
+; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10
-; GFX11_W64-NEXT: s_not_b64 exec, exec
-; GFX11_W64-NEXT: global_store_b32 v[8:9], v0, off
+; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
+; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11_W64-NEXT: v_mov_b32_e32 v1, v0
+; GFX11_W64-NEXT: global_store_b32 v[8:9], v1, off
; GFX11_W64-NEXT: s_endpgm
;
; GFX10_W64-LABEL: set_inactive_chain_arg:
; GFX10_W64: ; %bb.0:
; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, v11
-; GFX10_W64-NEXT: s_not_b64 exec, exec
+; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10
-; GFX10_W64-NEXT: s_not_b64 exec, exec
-; GFX10_W64-NEXT: global_store_dword v[8:9], v0, off
+; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
+; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, v0
+; GFX10_W64-NEXT: global_store_dword v[8:9], v1, off
; GFX10_W64-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 %active, i32 %inactive) #0
- store i32 %tmp, ptr addrspace(1) %out
+ %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp)
+ store i32 %wwm, ptr addrspace(1) %out
ret void
}
define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i64 %inactive, i64 %active) {
-; GFX11-LABEL: set_inactive_chain_arg_64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v12
-; GFX11-NEXT: v_mov_b32_e32 v1, v13
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT: v_mov_b32_e32 v0, v10
-; GFX11-NEXT: v_mov_b32_e32 v1, v11
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT: global_store_b64 v[8:9], v[0:1], off
-; GFX11-NEXT: s_endpgm
+; GISEL11-LABEL: set_inactive_chain_arg_64:
+; GISEL11: ; %bb.0:
+; GISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_mov_b32 v1, v11
+; GISEL11-NEXT: s_mov_b32 exec_lo, s0
+; GISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
+; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; GISEL11-NEXT: s_mov_b32 exec_lo, s0
+; GISEL11-NEXT: v_mov_b32_e32 v2, v0
+; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL11-NEXT: v_mov_b32_e32 v3, v1
+; GISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off
+; GISEL11-NEXT: s_endpgm
;
-; GFX10-LABEL: set_inactive_chain_arg_64:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v12
-; GFX10-NEXT: v_mov_b32_e32 v1, v13
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT: v_mov_b32_e32 v0, v10
-; GFX10-NEXT: v_mov_b32_e32 v1, v11
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT: global_store_dwordx2 v[8:9], v[0:1], off
-; GFX10-NEXT: s_endpgm
+; DAGISEL11-LABEL: set_inactive_chain_arg_64:
+; DAGISEL11: ; %bb.0:
+; DAGISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10
+; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL11-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; DAGISEL11-NEXT: v_mov_b32_e32 v3, v1
+; DAGISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11-NEXT: s_endpgm
;
-; GFX11_W64-LABEL: set_inactive_chain_arg_64:
-; GFX11_W64: ; %bb.0:
-; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11_W64-NEXT: v_mov_b32_e32 v0, v12
-; GFX11_W64-NEXT: v_mov_b32_e32 v1, v13
-; GFX11_W64-NEXT: s_not_b64 exec, exec
-; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10
-; GFX11_W64-NEXT: v_mov_b32_e32 v1, v11
-; GFX11_W64-NEXT: s_not_b64 exec, exec
-; GFX11_W64-NEXT: global_store_b64 v[8:9], v[0:1], off
-; GFX11_W64-NEXT: s_endpgm
+; GISEL10-LABEL: set_inactive_chain_arg_64:
+; GISEL10: ; %bb.0:
+; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT: v_mov_b32_e32 v0, v10
+; GISEL10-NEXT: v_mov_b32_e32 v1, v11
+; GISEL10-NEXT: s_mov_b32 exec_lo, s0
+; GISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
+; GISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; GISEL10-NEXT: s_mov_b32 exec_lo, s0
+; GISEL10-NEXT: v_mov_b32_e32 v2, v0
+; GISEL10-NEXT: v_mov_b32_e32 v3, v1
+; GISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; GISEL10-NEXT: s_endpgm
;
-; GFX10_W64-LABEL: set_inactive_chain_arg_64:
-; GFX10_W64: ; %bb.0:
-; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, v12
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, v13
-; GFX10_W64-NEXT: s_not_b64 exec, exec
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, v11
-; GFX10_W64-NEXT: s_not_b64 exec, exec
-; GFX10_W64-NEXT: global_store_dwordx2 v[8:9], v[0:1], off
-; GFX10_W64-NEXT: s_endpgm
+; DAGISEL10-LABEL: set_inactive_chain_arg_64:
+; DAGISEL10: ; %bb.0:
+; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT: v_mov_b32_e32 v1, v11
+; DAGISEL10-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
+; DAGISEL10-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL10-NEXT: v_mov_b32_e32 v3, v1
+; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; DAGISEL10-NEXT: s_endpgm
+;
+; GISEL11_W64-LABEL: set_inactive_chain_arg_64:
+; GISEL11_W64: ; %bb.0:
+; GISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v10
+; GISEL11_W64-NEXT: v_mov_b32_e32 v1, v11
+; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL11_W64-NEXT: v_mov_b32_e32 v2, v0
+; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL11_W64-NEXT: v_mov_b32_e32 v3, v1
+; GISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off
+; GISEL11_W64-NEXT: s_endpgm
+;
+; DAGISEL11_W64-LABEL: set_inactive_chain_arg_64:
+; DAGISEL11_W64: ; %bb.0:
+; DAGISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v11
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v3, v1
+; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11_W64-NEXT: s_endpgm
+;
+; GISEL10_W64-LABEL: set_inactive_chain_arg_64:
+; GISEL10_W64: ; %bb.0:
+; GISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v10
+; GISEL10_W64-NEXT: v_mov_b32_e32 v1, v11
+; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; GISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GISEL10_W64-NEXT: v_mov_b32_e32 v2, v0
+; GISEL10_W64-NEXT: v_mov_b32_e32 v3, v1
+; GISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; GISEL10_W64-NEXT: s_endpgm
+;
+; DAGISEL10_W64-LABEL: set_inactive_chain_arg_64:
+; DAGISEL10_W64: ; %bb.0:
+; DAGISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v11
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, v0
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v3, v1
+; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; DAGISEL10_W64-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0
- store i64 %tmp, ptr addrspace(1) %out
+ %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp)
+ store i64 %wwm, ptr addrspace(1) %out
ret void
}
@@ -113,16 +220,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: v_mov_b32_e32 v0, v10
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v11
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v0
-; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v2, v1
; GFX11-NEXT: global_store_b32 v[8:9], v2, off
; GFX11-NEXT: s_endpgm
@@ -133,11 +237,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
; GFX10-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-NEXT: v_mov_b32_e32 v0, v10
; GFX10-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-NEXT: v_mov_b32_e32 v0, v11
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-NEXT: v_mov_b32_e32 v0, v0
-; GFX10-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX10-NEXT: s_mov_b32 exec_lo, s0
@@ -151,17 +252,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10
; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX11_W64-NEXT: v_mov_b32_e32 v0, v11
-; GFX11_W64-NEXT: s_not_b64 exec, exec
-; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11_W64-NEXT: v_mov_b32_e32 v0, v0
-; GFX11_W64-NEXT: s_not_b64 exec, exec
; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX11_W64-NEXT: s_waitcnt_depctr 0xfff
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11_W64-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11_W64-NEXT: v_mov_b32_e32 v2, v1
; GFX11_W64-NEXT: global_store_b32 v[8:9], v2, off
; GFX11_W64-NEXT: s_endpgm
@@ -172,11 +269,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10
; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, v11
-; GFX10_W64-NEXT: s_not_b64 exec, exec
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, v0
-; GFX10_W64-NEXT: s_not_b64 exec, exec
; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: v_mov_b32_dpp v1, v0 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
@@ -214,11 +308,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; GISEL11-NEXT: v_mov_b32_e32 v11, 0
; GISEL11-NEXT: s_waitcnt lgkmcnt(0)
; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL11-NEXT: v_mov_b32_e32 v12, v43
-; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL11-NEXT: v_mov_b32_e32 v12, v40
-; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; GISEL11-NEXT: s_mov_b32 exec_lo, s0
; GISEL11-NEXT: v_mov_b32_e32 v0, v12
; GISEL11-NEXT: global_store_b32 v[41:42], v0, off
; GISEL11-NEXT: s_endpgm
@@ -244,11 +337,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; DAGISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0
; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off
; DAGISEL11-NEXT: s_endpgm
@@ -283,10 +375,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GISEL10-NEXT: v_mov_b32_e32 v12, v43
-; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL10-NEXT: v_mov_b32_e32 v12, v40
-; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; GISEL10-NEXT: s_mov_b32 exec_lo, s0
; GISEL10-NEXT: v_mov_b32_e32 v0, v12
; GISEL10-NEXT: global_store_dword v[41:42], v0, off
; GISEL10-NEXT: s_endpgm
@@ -321,10 +412,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off
; DAGISEL10-NEXT: s_endpgm
@@ -357,11 +447,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0
; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0)
; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43
-; GISEL11_W64-NEXT: s_not_b64 exec, exec
-; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40
-; GISEL11_W64-NEXT: s_not_b64 exec, exec
-; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
; GISEL11_W64-NEXT: s_endpgm
@@ -394,11 +483,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0
; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL11_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL11_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
; DAGISEL11_W64-NEXT: s_endpgm
@@ -433,10 +521,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43
-; GISEL10_W64-NEXT: s_not_b64 exec, exec
-; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40
-; GISEL10_W64-NEXT: s_not_b64 exec, exec
+; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12
; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off
; GISEL10_W64-NEXT: s_endpgm
@@ -471,10 +558,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL10_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL10_W64-NEXT: s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off
; DAGISEL10_W64-NEXT: s_endpgm
@@ -511,11 +597,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; GISEL11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0
; GISEL11-NEXT: s_waitcnt lgkmcnt(0)
; GISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL11-NEXT: v_mov_b32_e32 v12, v43
-; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL11-NEXT: v_mov_b32_e32 v12, v40
-; GISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; GISEL11-NEXT: s_mov_b32 exec_lo, s0
; GISEL11-NEXT: v_mov_b32_e32 v0, v12
; GISEL11-NEXT: global_store_b32 v[41:42], v0, off
; GISEL11-NEXT: s_endpgm
@@ -541,11 +626,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; DAGISEL11-NEXT: v_mov_b32_e32 v11, 0
; DAGISEL11-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL11-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL11-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL11-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL11-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11-NEXT: global_store_b32 v[41:42], v0, off
; DAGISEL11-NEXT: s_endpgm
@@ -580,10 +664,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GISEL10-NEXT: v_mov_b32_e32 v12, v43
-; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo
-; GISEL10-NEXT: v_mov_b32_e32 v12, v40
-; GISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; GISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; GISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; GISEL10-NEXT: s_mov_b32 exec_lo, s0
; GISEL10-NEXT: v_mov_b32_e32 v0, v12
; GISEL10-NEXT: global_store_dword v[41:42], v0, off
; GISEL10-NEXT: s_endpgm
@@ -618,10 +701,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; DAGISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL10-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL10-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; DAGISEL10-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo
-; DAGISEL10-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL10-NEXT: s_not_b32 exec_lo, exec_lo
+; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v12, v40, v43, s0
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL10-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL10-NEXT: global_store_dword v[41:42], v0, off
; DAGISEL10-NEXT: s_endpgm
@@ -654,11 +736,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; GISEL11_W64-NEXT: v_mov_b32_e32 v11, 0
; GISEL11_W64-NEXT: s_waitcnt lgkmcnt(0)
; GISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v43
-; GISEL11_W64-NEXT: s_not_b64 exec, exec
-; GISEL11_W64-NEXT: v_mov_b32_e32 v12, v40
-; GISEL11_W64-NEXT: s_not_b64 exec, exec
-; GISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; GISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
; GISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; GISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
; GISEL11_W64-NEXT: s_endpgm
@@ -691,11 +772,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v11, 0
; DAGISEL11_W64-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL11_W64-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL11_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL11_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL11_W64-NEXT: global_store_b32 v[41:42], v0, off
; DAGISEL11_W64-NEXT: s_endpgm
@@ -730,10 +810,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; GISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v43
-; GISEL10_W64-NEXT: s_not_b64 exec, exec
-; GISEL10_W64-NEXT: v_mov_b32_e32 v12, v40
-; GISEL10_W64-NEXT: s_not_b64 exec, exec
+; GISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; GISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
; GISEL10_W64-NEXT: v_mov_b32_e32 v0, v12
; GISEL10_W64-NEXT: global_store_dword v[41:42], v0, off
; GISEL10_W64-NEXT: s_endpgm
@@ -768,10 +847,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
; DAGISEL10_W64-NEXT: s_mov_b64 s[2:3], s[50:51]
; DAGISEL10_W64-NEXT: s_waitcnt lgkmcnt(0)
; DAGISEL10_W64-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v43
-; DAGISEL10_W64-NEXT: s_not_b64 exec, exec
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v12, v40
-; DAGISEL10_W64-NEXT: s_not_b64 exec, exec
+; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v12, v40, v43, s[0:1]
+; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v12
; DAGISEL10_W64-NEXT: global_store_dword v[41:42], v0, off
; DAGISEL10_W64-NEXT: s_endpgm
@@ -786,6 +864,7 @@ declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)
+declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
declare amdgpu_gfx void @gfx_callee(<12 x i32>)
attributes #0 = { convergent readnone willreturn nocallback nofree}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 114d2d099ab7b1..6dc4a2ce0504b8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -5,18 +5,22 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
store i32 %tmp, ptr addrspace(1) %out
ret void
}
@@ -25,13 +29,15 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
store i32 %tmp, ptr addrspace(1) %out
ret void
}
@@ -40,20 +46,25 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+ %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+ %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
store i64 %tmp, ptr addrspace(1) %out
ret void
}
@@ -63,13 +74,16 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
+ %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
+ %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
store i64 %tmp, ptr addrspace(1) %out
ret void
}
@@ -82,12 +96,15 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s4, 56
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b64 s[2:3], -1
; GCN-NEXT: s_cbranch_scc1 .LBB4_3
; GCN-NEXT: ; %bb.1: ; %Flow
@@ -96,19 +113,20 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: .LBB4_2: ; %.exit
; GCN-NEXT: s_endpgm
; GCN-NEXT: .LBB4_3: ; %.one
-; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0
; GCN-NEXT: s_cbranch_execnz .LBB4_2
; GCN-NEXT: .LBB4_4: ; %.zero
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
%cmp = icmp eq i32 %val, 56
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
br i1 %cmp, label %.zero, label %.one
.zero:
@@ -127,19 +145,23 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s7, 0x40400000
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x40400000
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+ %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0)
store float %tmp, ptr addrspace(1) %out
ret void
}
@@ -148,22 +170,27 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+ %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0)
store double %tmp, ptr addrspace(1) %out
ret void
}
@@ -171,19 +198,23 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s7, 0x10001
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x10001
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+ %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0)
store <2 x i16> %tmp, ptr addrspace(1) %out
ret void
}
@@ -191,19 +222,23 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s7, 0x3c003c00
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x3c003c00
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+ %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2i16(<2 x half> %tmp.0)
store <2 x half> %tmp, ptr addrspace(1) %out
ret void
}
@@ -212,22 +247,27 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s8, 1
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s10, 1
+; GCN-NEXT: s_mov_b32 s11, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+ %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0)
store <2 x i32> %tmp, ptr addrspace(1) %out
ret void
}
@@ -236,22 +276,27 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s8, 1.0
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s10, 1.0
+; GCN-NEXT: s_mov_b32 s11, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+ %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0)
store <2 x float> %tmp, ptr addrspace(1) %out
ret void
}
@@ -259,19 +304,23 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s7, 0x3f803f80
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x3f803f80
+; GCN-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+ %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+ %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0)
store <2 x bfloat> %tmp, ptr addrspace(1) %out
ret void
}
@@ -280,22 +329,27 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
; GCN-LABEL: set_inactive_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s8, 0x10001
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s10, 0x10001
+; GCN-NEXT: s_mov_b32 s11, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+ %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+ %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
store <4 x i16> %tmp, ptr addrspace(1) %out
ret void
}
@@ -304,22 +358,27 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
; GCN-LABEL: set_inactive_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s8, 0x3c003c00
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s10, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s11, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+ %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+ %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
store <4 x half> %tmp, ptr addrspace(1) %out
ret void
}
@@ -328,22 +387,27 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
; GCN-LABEL: set_inactive_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_mov_b32 s8, 0x3f803f80
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s10, 0x3f803f80
+; GCN-NEXT: s_mov_b32 s11, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s9, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+ %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+ %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
store <4 x bfloat> %tmp, ptr addrspace(1) %out
ret void
}
@@ -352,20 +416,25 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+ %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0)
store ptr %tmp, ptr addrspace(1) %out
ret void
}
@@ -373,18 +442,22 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+ %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0)
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
ret void
}
@@ -392,18 +465,22 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+ %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0)
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
ret void
}
@@ -411,18 +488,22 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+ %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0)
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
ret void
}
@@ -430,24 +511,31 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
- %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+ %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0)
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
ret void
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
+declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
attributes #0 = { convergent readnone }
+attributes #1 = { convergent nounwind readnone speculatable willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index 81858bd3d29ee0..f60786c1bacbff 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -15,11 +15,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: s_cbranch_execz .LBB0_4
; GCN-NEXT: ; %bb.3: ; %.then
-; GCN-NEXT: v_mov_b32_e32 v1, v3
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
; GCN-NEXT: s_or_saveexec_b32 s1, -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_dpp v2, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s1
@@ -82,12 +79,7 @@ define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrs
; GCN-NEXT: .LBB1_5: ; %.else
; GCN-NEXT: s_or_saveexec_b32 s1, -1
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b32 exec_lo, s1
-; GCN-NEXT: v_mov_b32_e32 v2, v3
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: s_or_saveexec_b32 s1, -1
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v3, s1
; GCN-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s1
; GCN-NEXT: v_mov_b32_e32 v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
index 09e342fe190666..90b32e29e98f67 100644
--- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
@@ -23,11 +23,8 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 i
; GCN-NEXT: s_cbranch_execz .LBB0_1
; GCN-NEXT: ; %bb.3: ; %bb1
; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
-; GCN-NEXT: v_mov_b32_e32 v3, s4
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
-; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_not_b32 exec_lo, exec_lo
; GCN-NEXT: s_or_saveexec_b32 s9, -1
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, s4, s9
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
; GCN-NEXT: s_mov_b32 exec_lo, s9
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index c3a81771a2790c..ff692acda3c255 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1674,13 +1674,13 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0
; GFX1032-NEXT: s_clause 0x1
; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v0, s4
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v0, 42
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, 42, s4, s2
+; GFX1032-NEXT: s_mov_b32 exec_lo, s2
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: global_store_dword v1, v2, s[0:1]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_set_inactive:
@@ -1688,15 +1688,16 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0
; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v0, s4
-; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: v_mov_b32_e32 v0, 42
-; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, 42, s4, s[2:3]
+; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: global_store_dword v1, v2, s[0:1]
; GFX1064-NEXT: s_endpgm
- %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
+ %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
+ %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
store i32 %tmp, ptr addrspace(1) %out
ret void
}
@@ -1705,31 +1706,32 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in)
; GFX1032-LABEL: test_set_inactive_64:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1032-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v0, s6
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s7, s0
+; GFX1032-NEXT: s_mov_b32 exec_lo, s0
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_set_inactive_64:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT: v_mov_b32_e32 v2, 0
+; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
-; GFX1064-NEXT: v_mov_b32_e32 v1, s7
-; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s7, s[0:1]
+; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX1064-NEXT: s_endpgm
- %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
+ %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
+ %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
store i64 %tmp, ptr addrspace(1) %out
ret void
}
@@ -2921,6 +2923,8 @@ declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
declare float @llvm.amdgcn.strict.wwm.f32(float)
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32)
+declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
declare float @llvm.amdgcn.wwm.f32(float)
declare i32 @llvm.amdgcn.wqm.i32(i32)
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 6b4c2da772cdc2..ab84c0c905771b 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -835,12 +835,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-W64-NEXT: s_not_b64 exec, exec
-; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
@@ -851,12 +848,9 @@ define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
@@ -1317,7 +1311,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
-; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec
+; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
@@ -1334,7 +1328,7 @@ define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec
+; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
@@ -2263,11 +2257,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
-; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
@@ -2293,11 +2284,8 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
@@ -2744,12 +2732,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-W64-NEXT: s_not_b64 exec, exec
-; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
@@ -2760,12 +2745,9 @@ define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
-; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
@@ -2799,11 +2781,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
-; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
@@ -2829,11 +2808,8 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index ef6d0780f395fd..534865173d9a59 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -282,10 +282,10 @@ body: |
#
#CHECK-NOT: ENTER_STRICT_WWM
#CHECK: BUFFER_LOAD_DWORDX2
-#CHECK-NOT: ENTER_STRICT_WWM
+#CHECK: ENTER_STRICT_WWM
#CHECK: V_SET_INACTIVE_B32
#CHECK: V_SET_INACTIVE_B32
-#CHECK: ENTER_STRICT_WWM
+#CHECK-NOT: ENTER_STRICT_WWM
#CHECK: V_MAX
name: test_wwm_set_inactive_propagation
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e79cb66dcd7760..47e1897f6b420a 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -30,15 +30,15 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -80,17 +80,10 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[34:35]
+; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
@@ -177,11 +170,11 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
+; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -208,12 +201,8 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[34:35]
+; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -270,34 +259,25 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[34:35]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc
-; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[36:37]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: .LBB1_2: ; %merge
+; GFX9-O3-NEXT: ; %bb.2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -378,26 +358,26 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-O0-NEXT: s_mov_b32 s40, s6
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: s_mov_b32 s42, s6
; GFX9-O0-NEXT: s_mov_b32 s34, s4
-; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT: s_mov_b32 s41, s7
-; GFX9-O0-NEXT: s_mov_b32 s42, s41
-; GFX9-O0-NEXT: s_mov_b32 s43, s40
+; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43
+; GFX9-O0-NEXT: s_mov_b32 s43, s7
+; GFX9-O0-NEXT: s_mov_b32 s44, s43
+; GFX9-O0-NEXT: s_mov_b32 s45, s42
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b32 s35, s5
-; GFX9-O0-NEXT: s_mov_b32 s44, s35
+; GFX9-O0-NEXT: s_mov_b32 s46, s35
; GFX9-O0-NEXT: s_mov_b32 s36, s34
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
-; GFX9-O0-NEXT: s_mov_b32 s37, s44
-; GFX9-O0-NEXT: s_mov_b32 s38, s43
-; GFX9-O0-NEXT: s_mov_b32 s39, s42
+; GFX9-O0-NEXT: s_mov_b32 s37, s46
+; GFX9-O0-NEXT: s_mov_b32 s38, s45
+; GFX9-O0-NEXT: s_mov_b32 s39, s44
; GFX9-O0-NEXT: s_mov_b32 s34, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: s_getpc_b64 s[42:43]
; GFX9-O0-NEXT: s_add_u32 s42, s42, strict_wwm_called at rel32@lo+4
; GFX9-O0-NEXT: s_addc_u32 s43, s43, strict_wwm_called at rel32@hi+12
@@ -437,11 +417,11 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0x400
; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: s_getpc_b64 s[36:37]
; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called at rel32@lo+4
@@ -559,7 +539,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-LABEL: strict_wwm_call_i64:
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s48, s33
+; GFX9-O0-NEXT: s_mov_b32 s50, s33
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -583,41 +563,41 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1
+; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 0
+; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 1
; GFX9-O0-NEXT: s_mov_b32 s34, s8
-; GFX9-O0-NEXT: s_mov_b32 s38, s6
+; GFX9-O0-NEXT: s_mov_b32 s40, s6
; GFX9-O0-NEXT: s_mov_b32 s36, s4
-; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39
-; GFX9-O0-NEXT: s_mov_b32 s39, s7
-; GFX9-O0-NEXT: s_mov_b32 s35, s39
-; GFX9-O0-NEXT: s_mov_b32 s44, s38
+; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
+; GFX9-O0-NEXT: s_mov_b32 s41, s7
+; GFX9-O0-NEXT: s_mov_b32 s35, s41
+; GFX9-O0-NEXT: s_mov_b32 s42, s40
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37
; GFX9-O0-NEXT: s_mov_b32 s37, s5
-; GFX9-O0-NEXT: s_mov_b32 s45, s37
-; GFX9-O0-NEXT: s_mov_b32 s40, s36
-; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43
-; GFX9-O0-NEXT: s_mov_b32 s41, s45
-; GFX9-O0-NEXT: s_mov_b32 s42, s44
-; GFX9-O0-NEXT: s_mov_b32 s43, s35
-; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2
-; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3
+; GFX9-O0-NEXT: s_mov_b32 s43, s37
+; GFX9-O0-NEXT: s_mov_b32 s44, s36
+; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47
+; GFX9-O0-NEXT: s_mov_b32 s45, s43
+; GFX9-O0-NEXT: s_mov_b32 s46, s42
+; GFX9-O0-NEXT: s_mov_b32 s47, s35
+; GFX9-O0-NEXT: v_writelane_b32 v0, s44, 2
+; GFX9-O0-NEXT: v_writelane_b32 v0, s45, 3
+; GFX9-O0-NEXT: v_writelane_b32 v0, s46, 4
+; GFX9-O0-NEXT: v_writelane_b32 v0, s47, 5
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b32 s35, s9
; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36
; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT: s_mov_b64 exec, s[38:39]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
; GFX9-O0-NEXT: s_mov_b32 s34, 32
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
@@ -634,20 +614,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4
-; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5
-; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0
-; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1
-; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2
-; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3
+; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 0
+; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 1
+; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2
+; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3
+; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4
+; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
@@ -679,14 +659,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000
-; GFX9-O0-NEXT: s_mov_b32 s33, s48
+; GFX9-O0-NEXT: s_mov_b32 s33, s50
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-O3-LABEL: strict_wwm_call_i64:
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT: s_mov_b32 s40, s33
+; GFX9-O3-NEXT: s_mov_b32 s38, s33
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -702,28 +682,26 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0x800
; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: s_getpc_b64 s[36:37]
-; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called_i64 at gotpcrel32@lo+4
-; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called_i64 at gotpcrel32@hi+12
-; GFX9-O3-NEXT: s_load_dwordx2 s[36:37], s[36:37], 0x0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
+; GFX9-O3-NEXT: s_getpc_b64 s[34:35]
+; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64 at gotpcrel32@lo+4
+; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64 at gotpcrel32@hi+12
+; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[38:39], -1
+; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-O3-NEXT: s_mov_b64 exec, s[38:39]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
@@ -739,7 +717,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-O3-NEXT: s_mov_b32 s33, s40
+; GFX9-O3-NEXT: s_mov_b32 s33, s38
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
%tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
@@ -778,16 +756,18 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s40, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT: s_mov_b32 s41, s35
+; GFX9-O0-NEXT: s_mov_b32 s42, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43
+; GFX9-O0-NEXT: s_mov_b32 s43, s35
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
@@ -796,21 +776,25 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s40
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
@@ -851,28 +835,30 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen
; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
-; GFX9-O3-NEXT: s_mov_b32 s34, -1
-; GFX9-O3-NEXT: s_brev_b32 s35, -2
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: s_mov_b32 s36, -1
+; GFX9-O3-NEXT: s_brev_b32 s37, -2
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, s36
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, s37
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s34
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s35
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, s36
+; GFX9-O3-NEXT: v_mov_b32_e32 v4, s37
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s34
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s35
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v5, s36
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, s37
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s34
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s35
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
@@ -922,21 +908,9 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -987,130 +961,110 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v42, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, s5
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v46, s25
-; GFX9-O0-NEXT: v_mov_b32_e32 v45, s26
-; GFX9-O0-NEXT: v_mov_b32_e32 v44, s27
-; GFX9-O0-NEXT: v_mov_b32_e32 v43, s28
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29
+; GFX9-O0-NEXT: v_mov_b32_e32 v39, s17
+; GFX9-O0-NEXT: v_mov_b32_e32 v38, s18
+; GFX9-O0-NEXT: v_mov_b32_e32 v37, s19
+; GFX9-O0-NEXT: v_mov_b32_e32 v36, s20
+; GFX9-O0-NEXT: v_mov_b32_e32 v35, s21
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v46, s23
+; GFX9-O0-NEXT: v_mov_b32_e32 v45, s24
+; GFX9-O0-NEXT: v_mov_b32_e32 v44, s25
+; GFX9-O0-NEXT: v_mov_b32_e32 v43, s26
+; GFX9-O0-NEXT: v_mov_b32_e32 v42, s27
+; GFX9-O0-NEXT: v_mov_b32_e32 v41, s28
+; GFX9-O0-NEXT: v_mov_b32_e32 v40, s29
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v46
-; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v45
-; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v44
-; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43
-; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v39
+; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v38
+; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v37
+; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v36
+; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(5)
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v34
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v46
+; GFX9-O0-NEXT: v_mov_b32_e32 v20, v45
+; GFX9-O0-NEXT: v_mov_b32_e32 v21, v44
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v42
+; GFX9-O0-NEXT: v_mov_b32_e32 v24, v41
+; GFX9-O0-NEXT: v_mov_b32_e32 v25, v40
+; GFX9-O0-NEXT: s_waitcnt vmcnt(5)
+; GFX9-O0-NEXT: v_mov_b32_e32 v26, v39
; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
-; GFX9-O0-NEXT: v_mov_b32_e32 v25, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v26, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v27, v46
-; GFX9-O0-NEXT: v_mov_b32_e32 v28, v45
-; GFX9-O0-NEXT: v_mov_b32_e32 v29, v44
-; GFX9-O0-NEXT: v_mov_b32_e32 v30, v43
-; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr42 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v27, v38
+; GFX9-O0-NEXT: s_waitcnt vmcnt(3)
+; GFX9-O0-NEXT: v_mov_b32_e32 v28, v37
+; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
+; GFX9-O0-NEXT: v_mov_b32_e32 v29, v36
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: v_mov_b32_e32 v30, v35
+; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr34 killed $exec
; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -1150,62 +1104,82 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
-; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10
; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v34, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v35, v9
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v34, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v35, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v36, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v37, v7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v36, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v37, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v38, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v39, v5
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v38, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v39, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v40, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v41, v3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v40, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v41, s35
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v33
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v33
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, v9
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v33
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, v7
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v33
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, v5
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v33
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, v3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v32
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v35
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v34
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v37
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v36
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v39
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v41
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5
@@ -1245,16 +1219,8 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -1265,73 +1231,56 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_load_dword v26, off, s[0:3], s32
; GFX9-O3-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4
; GFX9-O3-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:8
; GFX9-O3-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12
; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:16
; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
-; GFX9-O3-NEXT: v_mov_b32_e32 v32, v1
-; GFX9-O3-NEXT: v_mov_b32_e32 v33, v2
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v32, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v33, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v34, v3
-; GFX9-O3-NEXT: v_mov_b32_e32 v35, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v34, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v35, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v36, v5
-; GFX9-O3-NEXT: v_mov_b32_e32 v37, v6
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v36, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v37, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v38, v7
-; GFX9-O3-NEXT: v_mov_b32_e32 v39, v8
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v38, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v39, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v40, v9
-; GFX9-O3-NEXT: v_mov_b32_e32 v41, v10
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v40, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v41, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:4
-; GFX9-O3-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen
-; GFX9-O3-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:12
-; GFX9-O3-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:8
-; GFX9-O3-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:20
-; GFX9-O3-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:16
-; GFX9-O3-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:28
-; GFX9-O3-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:24
-; GFX9-O3-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:36
-; GFX9-O3-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:32
-; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: s_nop 0
-; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v2, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, v32
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, v33
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v4, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, v32
+; GFX9-O3-NEXT: v_mov_b32_e32 v4, v33
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v5, v32
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, v33
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v8, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, v32
+; GFX9-O3-NEXT: v_mov_b32_e32 v8, v33
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v10, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v9, v32
+; GFX9-O3-NEXT: v_mov_b32_e32 v10, v33
+; GFX9-O3-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX9-O3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-O3-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; GFX9-O3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; GFX9-O3-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; GFX9-O3-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:28
+; GFX9-O3-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; GFX9-O3-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:36
+; GFX9-O3-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:32
; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
; GFX9-O3-NEXT: v_mov_b32_e32 v1, s5
; GFX9-O3-NEXT: v_mov_b32_e32 v2, s6
@@ -1359,24 +1308,21 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O3-NEXT: v_mov_b32_e32 v24, s28
; GFX9-O3-NEXT: v_mov_b32_e32 v25, s29
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
- %a2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0)
- %b2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0)
- %c2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %c, i64 0)
- %d2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %d, i64 0)
- %e2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %e, i64 0)
+ %a2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0)
+ %a2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %a2.i)
+ %b2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0)
+ %b2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %b2.i)
+ %c2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %c, i64 0)
+ %c2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %c2.i)
+ %d2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %d, i64 0)
+ %d2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %d2.i)
+ %e2.i = call i64 @llvm.amdgcn.set.inactive.i64(i64 %e, i64 0)
+ %e2 = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %e2.i)
store i64 %a2, ptr addrspace(5) %ptr
%ptr_b = getelementptr i64, ptr addrspace(5) %ptr, i32 1
store i64 %b2, ptr addrspace(5) %ptr_b
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index a74dbe1de0d39e..7f0db3e362b308 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -26,15 +26,15 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -63,17 +63,10 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5]
+; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
@@ -154,11 +147,11 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -185,12 +178,8 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -236,34 +225,25 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: .LBB1_2: ; %merge
+; GFX9-O3-NEXT: ; %bb.2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -361,35 +341,35 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4
+; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s3, s7
-; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
-; GFX9-O0-NEXT: s_mov_b32 s7, s9
-; GFX9-O0-NEXT: s_mov_b32 s16, s8
+; GFX9-O0-NEXT: s_mov_b32 s3, s9
+; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX9-O0-NEXT: s_mov_b32 s9, s17
+; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s7
-; GFX9-O0-NEXT: s_mov_b32 s18, s6
+; GFX9-O0-NEXT: s_mov_b32 s17, s9
+; GFX9-O0-NEXT: s_mov_b32 s18, s8
; GFX9-O0-NEXT: s_mov_b32 s19, s3
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
; GFX9-O0-NEXT: s_mov_b32 s3, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9
; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -418,13 +398,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4
-; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5
-; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6
-; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7
-; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9
-; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10
-; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8
+; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6
+; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7
+; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8
+; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9
+; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4
+; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5
+; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -454,12 +434,12 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 56
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
@@ -613,35 +593,35 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4
+; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s6, s9
-; GFX9-O0-NEXT: s_mov_b32 s7, s8
-; GFX9-O0-NEXT: s_mov_b32 s8, s17
+; GFX9-O0-NEXT: s_mov_b32 s6, s19
+; GFX9-O0-NEXT: s_mov_b32 s7, s18
+; GFX9-O0-NEXT: s_mov_b32 s15, s17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s8
+; GFX9-O0-NEXT: s_mov_b32 s17, s15
; GFX9-O0-NEXT: s_mov_b32 s18, s7
; GFX9-O0-NEXT: s_mov_b32 s19, s6
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7
-; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9
+; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
+; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -678,12 +658,12 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4
-; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5
-; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6
-; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7
-; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8
-; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9
+; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6
+; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7
+; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8
+; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9
+; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4
+; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -721,14 +701,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 60
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: s_getpc_b64 s[2:3]
@@ -792,16 +772,18 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s6, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
-; GFX9-O0-NEXT: s_mov_b32 s7, s5
+; GFX9-O0-NEXT: s_mov_b32 s8, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
+; GFX9-O0-NEXT: s_mov_b32 s9, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
@@ -810,21 +792,25 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
@@ -848,28 +834,30 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
-; GFX9-O3-NEXT: s_mov_b32 s4, -1
-; GFX9-O3-NEXT: s_brev_b32 s5, -2
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: s_mov_b32 s6, -1
+; GFX9-O3-NEXT: s_brev_b32 s7, -2
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
@@ -927,15 +915,15 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -964,17 +952,10 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5]
+; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
@@ -1055,11 +1036,11 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1086,12 +1067,8 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
@@ -1137,34 +1114,25 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-O3-NEXT: s_cbranch_execz .LBB8_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7]
+; GFX9-O3-NEXT: s_nop 1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-O3-NEXT: .LBB8_2: ; %merge
+; GFX9-O3-NEXT: ; %bb.2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1262,35 +1230,35 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4
+; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s3, s7
-; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
-; GFX9-O0-NEXT: s_mov_b32 s7, s9
-; GFX9-O0-NEXT: s_mov_b32 s16, s8
+; GFX9-O0-NEXT: s_mov_b32 s3, s9
+; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX9-O0-NEXT: s_mov_b32 s9, s17
+; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s7
-; GFX9-O0-NEXT: s_mov_b32 s18, s6
+; GFX9-O0-NEXT: s_mov_b32 s17, s9
+; GFX9-O0-NEXT: s_mov_b32 s18, s8
; GFX9-O0-NEXT: s_mov_b32 s19, s3
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
; GFX9-O0-NEXT: s_mov_b32 s3, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 8
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 9
; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -1319,13 +1287,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4
-; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5
-; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6
-; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7
-; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9
-; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10
-; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8
+; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6
+; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7
+; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8
+; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9
+; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4
+; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5
+; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1355,12 +1323,12 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 56
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
@@ -1514,35 +1482,35 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4
+; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s6, s9
-; GFX9-O0-NEXT: s_mov_b32 s7, s8
-; GFX9-O0-NEXT: s_mov_b32 s8, s17
+; GFX9-O0-NEXT: s_mov_b32 s6, s19
+; GFX9-O0-NEXT: s_mov_b32 s7, s18
+; GFX9-O0-NEXT: s_mov_b32 s15, s17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s8
+; GFX9-O0-NEXT: s_mov_b32 s17, s15
; GFX9-O0-NEXT: s_mov_b32 s18, s7
; GFX9-O0-NEXT: s_mov_b32 s19, s6
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 5
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 7
-; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s2, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 9
+; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
+; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
+; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
+; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
+; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -1579,12 +1547,12 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4
-; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5
-; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6
-; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7
-; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8
-; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9
+; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6
+; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7
+; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8
+; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9
+; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4
+; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1622,14 +1590,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 60
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: s_getpc_b64 s[2:3]
@@ -1693,16 +1661,18 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s6, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
-; GFX9-O0-NEXT: s_mov_b32 s7, s5
+; GFX9-O0-NEXT: s_mov_b32 s8, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
+; GFX9-O0-NEXT: s_mov_b32 s9, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
@@ -1711,21 +1681,25 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT: s_not_b64 exec, exec
+; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
@@ -1749,28 +1723,30 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
-; GFX9-O3-NEXT: s_mov_b32 s4, -1
-; GFX9-O3-NEXT: s_brev_b32 s5, -2
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX9-O3-NEXT: s_mov_b32 s6, -1
+; GFX9-O3-NEXT: s_brev_b32 s7, -2
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-O3-NEXT: s_not_b64 exec, exec
+; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index b3ed7376a1ede6..f73489b7db77cf 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -273,12 +273,15 @@ define amdgpu_cs void @wwm_reserved_regs(ptr addrspace(1) %ptr, <4 x i32> inreg
%ld1 = load volatile i32, ptr addrspace(1) %ptr
%inactive0 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld1, i32 0)
%inactive1 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %ld0, i32 0)
- store volatile i32 %inactive0, ptr addrspace(1) %ptr
- store volatile i32 %inactive1, ptr addrspace(1) %ptr
+ %wwm0 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive0)
+ %wwm1 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %inactive1)
+ store volatile i32 %wwm0, ptr addrspace(1) %ptr
+ store volatile i32 %wwm1, ptr addrspace(1) %ptr
ret void
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #6
+declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #6
attributes #0 = { "no-signed-zeros-fp-math" = "true" }
attributes #1 = { "amdgpu-dx10-clamp" = "false" }
>From 58dbdda22fa7573b9b2fba21e4abff8c167b4bd2 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 29 Aug 2024 14:05:23 +0900
Subject: [PATCH 2/5] - Address reviewer comments
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6485b188e8422a..7129ef94251d24 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2280,7 +2280,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
? AMDGPU::V_MOV_B64_PSEUDO
: AMDGPU::V_MOV_B32_e32;
- Register ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ Register ExecReg = RI.getExec();
Register DstReg = MI.getOperand(0).getReg();
MachineOperand &ActiveSrc = MI.getOperand(1);
MachineOperand &InactiveSrc = MI.getOperand(2);
@@ -2307,7 +2307,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// present an issue.
// Fallback to V_MOV base lowering in all but the common cases.
const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
- const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineFunction *MF = MBB.getParent();
const MachineRegisterInfo &MRI = MF->getRegInfo();
const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
const MCInstrDesc &Desc = get(Opcode);
>From 44afd5f0f7543ed75fa79c2dc9b6a7dd6dbbdfd7 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Mon, 2 Sep 2024 15:27:39 +0900
Subject: [PATCH 3/5] - Address reviewer comments. - Add findImplicitExecSrc
helper. - Use helper to ignore V_SET_INACTIVE instructions during WQM/WWM
processing. This allows other passes to emit V_SET_INACTIVE for already
known WWM sections. This supports #105822. - Add test for above.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 65 ++++++++++++----------
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 26 +++++----
llvm/test/CodeGen/AMDGPU/wqm.mir | 19 +++++++
4 files changed, 74 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7129ef94251d24..4c29ae326888f6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2098,8 +2098,20 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
}
}
+Register SIInstrInfo::findImplicitExecSrc(const MachineInstr &MI) {
+ for (auto &Op : MI.implicit_operands()) {
+ if (Op.isDef())
+ continue;
+ Register OpReg = Op.getReg();
+ if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+ OpReg == AMDGPU::SCC)
+ continue;
+ return OpReg;
+ }
+ return Register();
+}
+
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI.getOpcode()) {
@@ -2286,21 +2298,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineOperand &InactiveSrc = MI.getOperand(2);
// Find implicit register defining lanes active outside WWM.
+ Register ExecSrcReg = findImplicitExecSrc(MI);
+ assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
// Note: default here is set to ExecReg so that functional MIR is still
// generated if implicit def is not found and assertions are disabled.
- Register ExecSrcReg = ExecReg;
- for (auto &Op : MI.implicit_operands()) {
- if (Op.isDef() || !Op.isReg())
- continue;
- Register OpReg = Op.getReg();
- if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
- OpReg == AMDGPU::SCC)
- continue;
- ExecSrcReg = OpReg;
- break;
- }
- assert(ExecSrcReg != ExecReg &&
- "V_SET_INACTIVE must be in known WWM region");
+ if (!ExecSrcReg)
+ ExecSrcReg = ExecReg;
// Ideally in WWM this operation is lowered to V_CNDMASK; however,
// constant bus constraints and the presence of literal constants
@@ -2329,8 +2332,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
(usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
(usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
int LiteralConstants =
- (ActiveSrc.isImm() && !isInlineConstant(ActiveImm) ? 1 : 0) +
- (InactiveSrc.isImm() && !isInlineConstant(InactiveImm) ? 1 : 0);
+ ((ActiveSrc.isReg() ||
+ (ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
+ ? 0
+ : 1) +
+ ((InactiveSrc.isReg() ||
+ (InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
+ ? 0
+ : 1);
bool UseVCndMask =
ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
@@ -2338,11 +2347,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// Decomposition must not introduce new literals.
UseVCndMask &=
ActiveSrc.isReg() ||
- (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmLo)) ||
+ (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
(!isInlineConstant(ActiveImm));
UseVCndMask &= InactiveSrc.isReg() ||
(isInlineConstant(InactiveImmLo) &&
- isInlineConstant(InactiveImmLo)) ||
+ isInlineConstant(InactiveImmHi)) ||
(!isInlineConstant(InactiveImm));
}
@@ -2352,34 +2361,34 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
ActiveSrc.isReg()
? MachineOperand::CreateReg(
RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0), false,
- /*isImp=*/false, /*isKill*/ false)
+ /*isImp=*/false, /*isKill=*/false)
: MachineOperand::CreateImm(ActiveImmLo.getSExtValue());
MachineOperand ActiveHi =
ActiveSrc.isReg()
? MachineOperand::CreateReg(
RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1), false,
- /*isImp=*/false, /*isKill*/ ActiveSrc.isKill())
+ /*isImp=*/false, /*isKill=*/ActiveSrc.isKill())
: MachineOperand::CreateImm(ActiveImmHi.getSExtValue());
MachineOperand InactiveLo =
InactiveSrc.isReg()
? MachineOperand::CreateReg(
RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0), false,
- /*isImp=*/false, /*isKill*/ false)
+ /*isImp=*/false, /*isKill=*/false)
: MachineOperand::CreateImm(InactiveImmLo.getSExtValue());
MachineOperand InactiveHi =
InactiveSrc.isReg()
? MachineOperand::CreateReg(
RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1), false,
- /*isImp=*/false, /*isKill*/ InactiveSrc.isKill())
+ /*isImp=*/false, /*isKill=*/InactiveSrc.isKill())
: MachineOperand::CreateImm(InactiveImmHi.getSExtValue());
- BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub0))
+ BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
.addImm(0)
.add(InactiveLo)
.addImm(0)
.add(ActiveLo)
.addReg(ExecSrcReg)
.addReg(DstReg, RegState::ImplicitDefine);
- BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub1))
+ BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
.addImm(0)
.add(InactiveHi)
.addImm(0)
@@ -2388,7 +2397,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(DstReg, RegState::ImplicitDefine);
} else if (UseVCndMask) {
// Single V_CNDMASK_B32
- BuildMI(MBB, MI, DL, get(Opcode), DstReg)
+ BuildMI(MBB, MI, DL, Desc, DstReg)
.addImm(0)
.add(InactiveSrc)
.addImm(0)
@@ -2406,9 +2415,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// Set exec mask to inactive lanes,
// but only if active lanes would be overwritten.
if (DstIsActive) {
- MachineInstr *ExecMI =
- BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecSrcReg);
- ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
+ BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
+ .addReg(ExecSrcReg)
+ .setOperandDead(3); // Dead scc
}
// Copy inactive lanes
MachineInstr *VMov =
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 4fd9b4366159be..958094906765ed 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1437,6 +1437,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
// This is used if an operand is a 32 bit register but needs to be aligned
// regardless.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
+
+ static Register findImplicitExecSrc(const MachineInstr &MI);
};
/// \brief Returns true if a reg:subreg pair P has a TRC class
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index fe2b40db1d4ea8..4eb1ebc801f5da 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -559,18 +559,24 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
GlobalFlags |= StateStrictWQM;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
- // Disable strict states; StrictWQM will be added as required later.
- III.Disabled = StateStrict;
- MachineOperand &Inactive = MI.getOperand(2);
- if (Inactive.isReg()) {
- if (Inactive.isUndef()) {
- LowerToCopyInstrs.insert(&MI);
- } else {
- markOperand(MI, Inactive, StateStrictWWM, Worklist);
+      // Ignore any V_SET_INACTIVE which already has an exec src register.
+      // These are generated by an earlier pass which has separately ensured
+      // WWM and provided a mask of inactive lanes.
+ Register ExecSrc = TII->findImplicitExecSrc(MI);
+ if (!ExecSrc) {
+ // Disable strict states; StrictWQM will be added as required later.
+ III.Disabled = StateStrict;
+ MachineOperand &Inactive = MI.getOperand(2);
+ if (Inactive.isReg()) {
+ if (Inactive.isUndef()) {
+ LowerToCopyInstrs.insert(&MI);
+ } else {
+ markOperand(MI, Inactive, StateStrictWWM, Worklist);
+ }
}
+ SetInactiveInstrs.push_back(&MI);
+ BBI.NeedsLowering = true;
}
- SetInactiveInstrs.push_back(&MI);
- BBI.NeedsLowering = true;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index 534865173d9a59..64a7c4457395c0 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -40,6 +40,9 @@
define amdgpu_vs void @no_wqm_in_vs() {
ret void
}
+ define amdgpu_ps void @preloaded_set_inactive() {
+ ret void
+ }
...
---
@@ -443,3 +446,19 @@ body: |
%4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
...
+
+---
+# Preserve V_SET_INACTIVE with exec mask already specified
+#CHECK-LABEL: name: preloaded_set_inactive
+#CHECK: V_SET_INACTIVE_B32
+name: preloaded_set_inactive
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr1, $vgpr2
+
+ %0:vgpr_32 = COPY $vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ %mask:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ %value:vgpr_32 = V_SET_INACTIVE_B32 %0:vgpr_32, %1:vgpr_32, implicit $exec, implicit-def $scc, implicit %mask:sreg_64
+...
>From 439661b3a2a7f6ab88965081dcda3fdca697b0de Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Tue, 3 Sep 2024 15:59:30 +0900
Subject: [PATCH 4/5] findImplicitExecSrc -> findSetInactiveMask
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 ++++--
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +-
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 2 +-
3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4c29ae326888f6..3fd8eb3cfd33c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2098,7 +2098,9 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
}
}
-Register SIInstrInfo::findImplicitExecSrc(const MachineInstr &MI) {
+Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
+ assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+ MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
for (auto &Op : MI.implicit_operands()) {
if (Op.isDef())
continue;
@@ -2298,7 +2300,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineOperand &InactiveSrc = MI.getOperand(2);
// Find implicit register defining lanes active outside WWM.
- Register ExecSrcReg = findImplicitExecSrc(MI);
+ Register ExecSrcReg = findSetInactiveMask(MI);
assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
// Note: default here is set to ExecReg so that functional MIR is still
// generated if implicit def is not found and assertions are disabled.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 958094906765ed..71432510fdee4f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1438,7 +1438,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
// regardless.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
- static Register findImplicitExecSrc(const MachineInstr &MI);
+ static Register findSetInactiveMask(const MachineInstr &MI);
};
/// \brief Returns true if a reg:subreg pair P has a TRC class
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 4eb1ebc801f5da..c6588147443d35 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -562,7 +562,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       // Ignore any V_SET_INACTIVE which already has an exec src register.
       // These are generated by an earlier pass which has separately ensured
       // WWM and provided a mask of inactive lanes.
- Register ExecSrc = TII->findImplicitExecSrc(MI);
+ Register ExecSrc = TII->findSetInactiveMask(MI);
if (!ExecSrc) {
// Disable strict states; StrictWQM will be added as required later.
III.Disabled = StateStrict;
>From d1f19e77855da9fd5f851540049b860c36bf0895 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Wed, 4 Sep 2024 14:45:21 +0900
Subject: [PATCH 5/5] - Address reviewer comments
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 43 +++++++++-------------
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 3 +-
2 files changed, 19 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3fd8eb3cfd33c6..844f62abc26717 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2312,8 +2312,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// present an issue.
// Fallback to V_MOV base lowering in all but the common cases.
const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
- const MachineFunction *MF = MBB.getParent();
- const MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
const MCInstrDesc &Desc = get(Opcode);
@@ -2359,30 +2359,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
if (UseVCndMask && VMov64) {
// Dual V_CNDMASK_B32
- MachineOperand ActiveLo =
- ActiveSrc.isReg()
- ? MachineOperand::CreateReg(
- RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0), false,
- /*isImp=*/false, /*isKill=*/false)
- : MachineOperand::CreateImm(ActiveImmLo.getSExtValue());
- MachineOperand ActiveHi =
- ActiveSrc.isReg()
- ? MachineOperand::CreateReg(
- RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1), false,
- /*isImp=*/false, /*isKill=*/ActiveSrc.isKill())
- : MachineOperand::CreateImm(ActiveImmHi.getSExtValue());
- MachineOperand InactiveLo =
- InactiveSrc.isReg()
- ? MachineOperand::CreateReg(
- RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0), false,
- /*isImp=*/false, /*isKill=*/false)
- : MachineOperand::CreateImm(InactiveImmLo.getSExtValue());
- MachineOperand InactiveHi =
- InactiveSrc.isReg()
- ? MachineOperand::CreateReg(
- RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1), false,
- /*isImp=*/false, /*isKill=*/InactiveSrc.isKill())
- : MachineOperand::CreateImm(InactiveImmHi.getSExtValue());
+ MachineOperand ActiveLo = buildExtractSubRegOrImm(
+ MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
+ MachineOperand ActiveHi = buildExtractSubRegOrImm(
+ MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
+ MachineOperand InactiveLo = buildExtractSubRegOrImm(
+ MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
+ MachineOperand InactiveHi = buildExtractSubRegOrImm(
+ MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
+ if (ActiveSrc.isReg())
+ ActiveHi.setIsKill(ActiveSrc.isKill());
+ if (InactiveSrc.isReg())
+ InactiveHi.setIsKill(InactiveSrc.isKill());
BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
.addImm(0)
.add(InactiveLo)
@@ -5783,6 +5771,9 @@ unsigned SIInstrInfo::buildExtractSubReg(
MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
unsigned SubIdx, const TargetRegisterClass *SubRC) const {
+ if (!SuperReg.getReg().isVirtual())
+ return RI.getSubReg(SuperReg.getReg(), SubIdx);
+
MachineBasicBlock *MBB = MI->getParent();
DebugLoc DL = MI->getDebugLoc();
Register SubReg = MRI.createVirtualRegister(SubRC);
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index c6588147443d35..bc4b1936cb7e38 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -1082,8 +1082,9 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
if (ActiveLanesReg) {
MI.addOperand(*MBB.getParent(),
MachineOperand::CreateReg(ActiveLanesReg, false, true));
- } else
+ } else {
assert(State == StateExact || State == StateWQM);
+ }
break;
default:
break;
More information about the llvm-commits
mailing list