[llvm] 208332d - [AMDGPU] Add Optimize VGPR LiveRange Pass.
Ruiling Song via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 21 00:27:17 PDT 2021
Author: Ruiling Song
Date: 2021-06-21T15:25:55+08:00
New Revision: 208332de8abf126b6fb5590bea47cd12257bc064
URL: https://github.com/llvm/llvm-project/commit/208332de8abf126b6fb5590bea47cd12257bc064
DIFF: https://github.com/llvm/llvm-project/commit/208332de8abf126b6fb5590bea47cd12257bc064.diff
LOG: [AMDGPU] Add Optimize VGPR LiveRange Pass.
This pass aims to optimize VGPR live ranges in the typical divergent if-else
control flow. For example:
def(a)
if(cond)
  use(a)
  ... // A
else
  use(a)
As AMDGPU accesses VGPRs with respect to the active mask, we can mark `a` as
dead in region A. For details, please refer to the comments in the
implementation file.
The pass is enabled by default; the frontend can disable it through
"-amdgpu-opt-vgpr-liverange=false".
Differential Revision: https://reviews.llvm.org/D102212
Added:
llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/CMakeLists.txt
llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
llvm/test/CodeGen/AMDGPU/bypass-div.ll
llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 2cfda5533dbb9..fa3c7e657fc1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -56,6 +56,7 @@ FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIOptimizeExecMaskingPreRAPass();
+FunctionPass *createSIOptimizeVGPRLiveRangePass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
@@ -297,6 +298,9 @@ struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> {
void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&);
extern char &SIOptimizeExecMaskingPreRAID;
+void initializeSIOptimizeVGPRLiveRangePass(PassRegistry &);
+extern char &SIOptimizeVGPRLiveRangeID;
+
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
extern char &AMDGPUAnnotateUniformValuesPassID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 63b3a8d3b29e0..2c1e5092b26b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -162,6 +162,11 @@ static cl::opt<bool> EnableRegReassign(
cl::init(true),
cl::Hidden);
+static cl::opt<bool> OptVGPRLiveRange(
+ "amdgpu-opt-vgpr-liverange",
+ cl::desc("Enable VGPR liverange optimizations for if-else structure"),
+ cl::init(true), cl::Hidden);
+
// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
"amdgpu-atomic-optimizations",
@@ -225,6 +230,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
initializeSIOptimizeExecMaskingPreRAPass(*PR);
+ initializeSIOptimizeVGPRLiveRangePass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUFixFunctionBitcastsPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
@@ -1190,6 +1196,12 @@ void GCNPassConfig::addOptimizedRegAlloc() {
if (TM->getOptLevel() > CodeGenOpt::Less)
insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+ // FIXME: when an instruction has a killed operand and the instruction is
+ // inside a bundle, it seems only the BUNDLE instruction appears in the Kills
+ // of the register in LiveVariables. This triggers a verifier failure, so we
+ // should fix it and enable the verifier.
+ if (OptVGPRLiveRange)
+ insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID, false);
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index bf44ad6a000d4..0e3ea8d313a26 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -132,6 +132,7 @@ add_llvm_target(AMDGPUCodeGen
SIMemoryLegalizer.cpp
SIOptimizeExecMasking.cpp
SIOptimizeExecMaskingPreRA.cpp
+ SIOptimizeVGPRLiveRange.cpp
SIPeepholeSDWA.cpp
SIPostRABundler.cpp
SIPreEmitPeephole.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
new file mode 100644
index 0000000000000..8e8d4bff672ff
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -0,0 +1,497 @@
+//===--------------------- SIOptimizeVGPRLiveRange.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass tries to remove unnecessary VGPR live ranges in divergent
+/// if-else structures.
+///
+/// When we do structurization, we usually transform an if-else into two
+/// successive if-thens (with a flow block to do predicate inversion). Consider
+/// a simple case after structurization: a divergent value %a is defined before
+/// the if-else and used in both the THEN part (a use in THEN is optional) and
+/// the ELSE part:
+/// bb.if:
+/// %a = ...
+/// ...
+/// bb.then:
+/// ... = op %a
+/// ... // %a can be dead here
+/// bb.flow:
+/// ...
+/// bb.else:
+/// ... = %a
+/// ...
+/// bb.endif
+///
+/// As the register allocator has no idea of the thread control flow, it will
+/// just assume %a is alive through the whole of bb.then because of the later
+/// use in bb.else. On the AMDGPU architecture, VGPRs are accessed with respect
+/// to the exec mask. For this if-else case, the lanes active in bb.then will
+/// be inactive in bb.else, and vice versa. So we can safely say that %a is
+/// dead from the last use in bb.then until the end of the block, because the
+/// instructions in bb.then will only overwrite lanes that are never accessed
+/// in bb.else.
+///
+/// This pass aims to tell the register allocator that %a is in fact dead, by
+/// inserting a phi-node in bb.flow saying that %a is undef when coming from
+/// bb.then, and then replacing the uses in bb.else with the result of the
+/// newly inserted phi.
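+///
+/// For the simple case above, the inserted phi in bb.flow looks like:
+///   %b = phi [ %a, bb.if ], [ undef, bb.then ]
+/// and the use in bb.else is rewritten to read %b instead of %a.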
+///
+/// Two key conditions must be met to ensure correctness:
+/// 1.) The def-point should be at the same loop level as the if-else-endif, to
+/// make sure the second loop iteration still gets correct data.
+/// 2.) There should be no further uses after the IF-ELSE region.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-opt-vgpr-liverange"
+
+namespace {
+
+class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ LiveVariables *LV = nullptr;
+ MachineDominatorTree *MDT = nullptr;
+ const MachineLoopInfo *Loops = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+public:
+ static char ID;
+
+ MachineBasicBlock *getElseTarget(MachineBasicBlock *MBB) const;
+
+ void collectElseRegionBlocks(MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &) const;
+
+ void
+ collectCandidateRegisters(MachineBasicBlock *If, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks,
+ SmallVectorImpl<Register> &CandidateRegs) const;
+
+ void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB,
+ SmallVectorImpl<MachineInstr *> &Uses) const;
+
+ void updateLiveRangeInThenRegion(Register Reg, MachineBasicBlock *If,
+ MachineBasicBlock *Flow) const;
+
+ void updateLiveRangeInElseRegion(
+ Register Reg, Register NewReg, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
+
+ void
+ optimizeLiveRange(Register Reg, MachineBasicBlock *If,
+ MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
+
+ SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI Optimize VGPR LiveRange";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveVariables>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<LiveVariables>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+};
+
+} // end anonymous namespace
+
+// Check whether the MBB is an else flow block and get the branch target,
+// which is the Endif block.
+MachineBasicBlock *
+SIOptimizeVGPRLiveRange::getElseTarget(MachineBasicBlock *MBB) const {
+ for (auto &BR : MBB->terminators()) {
+ if (BR.getOpcode() == AMDGPU::SI_ELSE)
+ return BR.getOperand(2).getMBB();
+ }
+ return nullptr;
+}
+
+void SIOptimizeVGPRLiveRange::collectElseRegionBlocks(
+ MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &Blocks) const {
+ assert(Flow != Endif);
+
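+ // Walk predecessors backwards from Endif; Blocks doubles as the worklist
+ // (Cur scans entries already collected), and skipping the Flow predecessor
+ // keeps the walk inside the ELSE region.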
+ MachineBasicBlock *MBB = Endif;
+ unsigned Cur = 0;
+ while (MBB) {
+ for (auto *Pred : MBB->predecessors()) {
+ if (Pred != Flow && !Blocks.contains(Pred))
+ Blocks.insert(Pred);
+ }
+
+ if (Cur < Blocks.size())
+ MBB = Blocks[Cur++];
+ else
+ MBB = nullptr;
+ }
+
+ LLVM_DEBUG(dbgs() << "Found Else blocks: ");
+ for (auto *MBB : Blocks)
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << ' ');
+ LLVM_DEBUG(dbgs() << '\n');
+}
+
+/// Find the instructions (excluding phis) in \p MBB that use \p Reg.
+void SIOptimizeVGPRLiveRange::findNonPHIUsesInBlock(
+ Register Reg, MachineBasicBlock *MBB,
+ SmallVectorImpl<MachineInstr *> &Uses) const {
+ for (auto &UseMI : MRI->use_nodbg_instructions(Reg)) {
+ if (UseMI.getParent() == MBB && !UseMI.isPHI())
+ Uses.push_back(&UseMI);
+ }
+}
+
+/// Collect the registers killed in the ELSE region that are not alive through
+/// the whole THEN region.
+void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
+ MachineBasicBlock *If, MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks,
+ SmallVectorImpl<Register> &CandidateRegs) const {
+
+ SmallSet<Register, 8> KillsInElse;
+
+ for (auto *Else : ElseBlocks) {
+ for (auto &MI : Else->instrs()) {
+ if (MI.isDebugInstr())
+ continue;
+
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.getReg() || MO.isDef())
+ continue;
+
+ Register MOReg = MO.getReg();
+ // We can only optimize AGPR/VGPR virtual registers.
+ if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg))
+ continue;
+
+ if (MO.isKill() && MO.readsReg()) {
+ LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg);
+ const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
+ // Make sure two conditions are met:
+ // a.) the value is defined before/in the IF block
+ // b.) it is defined at the same loop level.
+ if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
+ Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If))
+ KillsInElse.insert(MOReg);
+ }
+ }
+ }
+ }
+
+ // Check the phis in the Endif, looking for values coming from the ELSE
+ // region. Make sure the phi-use is the last use.
+ for (auto &MI : Endif->phis()) {
+ for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) {
+ auto &MO = MI.getOperand(Idx);
+ auto *Pred = MI.getOperand(Idx + 1).getMBB();
+ if (Pred == Flow)
+ continue;
+ assert(ElseBlocks.contains(Pred) && "Should be from Else region\n");
+
+ if (!MO.isReg() || !MO.getReg() || MO.isUndef())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical() || !TRI->isVectorRegister(*MRI, Reg))
+ continue;
+
+ LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+
+ if (VI.isLiveIn(*Endif, Reg, *MRI)) {
+ LLVM_DEBUG(dbgs() << "Excluding " << printReg(Reg, TRI)
+ << " as Live in Endif\n");
+ continue;
+ }
+ // Make sure two conditions are met:
+ // a.) the value is defined before/in the IF block
+ // b.) it is defined at the same loop level.
+ const MachineBasicBlock *DefMBB = MRI->getVRegDef(Reg)->getParent();
+ if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
+ Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If))
+ KillsInElse.insert(Reg);
+ }
+ }
+
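+ // A register is rejected if it is still read on a path through the THEN
+ // region: a non-phi use in the Flow or Endif block, a Flow phi incoming
+ // from a block other than If, or an Endif phi incoming from Flow.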
+ auto IsLiveThroughThen = [&](Register Reg) {
+ for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+ ++I) {
+ if (!I->readsReg())
+ continue;
+ auto *UseMI = I->getParent();
+ auto *UseMBB = UseMI->getParent();
+ if (UseMBB == Flow || UseMBB == Endif) {
+ if (!UseMI->isPHI())
+ return true;
+
+ auto *IncomingMBB = UseMI->getOperand(I.getOperandNo() + 1).getMBB();
+ // The register is live through the path If->Flow or Flow->Endif;
+ // we should not optimize such cases.
+ if ((UseMBB == Flow && IncomingMBB != If) ||
+ (UseMBB == Endif && IncomingMBB == Flow))
+ return true;
+ }
+ }
+ return false;
+ };
+
+ for (auto Reg : KillsInElse) {
+ if (!IsLiveThroughThen(Reg))
+ CandidateRegs.push_back(Reg);
+ }
+}
+
+// Recalculate the liveness of \p Reg in the THEN region.
+void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
+ Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
+
+ SmallPtrSet<MachineBasicBlock *, 16> PHIIncoming;
+
+ MachineBasicBlock *ThenEntry = nullptr;
+ for (auto *Succ : If->successors()) {
+ if (Succ != Flow) {
+ ThenEntry = Succ;
+ break;
+ }
+ }
+ assert(ThenEntry && "No successor in Then region?");
+
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+ df_iterator_default_set<MachineBasicBlock *, 16> Visited;
+
+ for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
+ if (MBB == Flow)
+ break;
+
+ // Clear Live bit, as we will recalculate afterwards
+ LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB)
+ << '\n');
+ OldVarInfo.AliveBlocks.reset(MBB->getNumber());
+ }
+
+ // Get the blocks the Reg should be alive through
+ for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+ ++I) {
+ auto *UseMI = I->getParent();
+ if (UseMI->isPHI() && I->readsReg()) {
+ if (Visited.contains(UseMI->getParent()))
+ PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB());
+ }
+ }
+
+ Visited.clear();
+
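+ // Second walk: replay the remaining non-PHI uses in program order so that
+ // LiveVariables recomputes the alive blocks and kill points inside the
+ // THEN region.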
+ for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
+ if (MBB == Flow)
+ break;
+
+ SmallVector<MachineInstr *> Uses;
+ // PHI instructions have been processed before.
+ findNonPHIUsesInBlock(Reg, MBB, Uses);
+
+ if (Uses.size() == 1) {
+ LLVM_DEBUG(dbgs() << "Found one Non-PHI use in "
+ << printMBBReference(*MBB) << '\n');
+ LV->HandleVirtRegUse(Reg, MBB, *(*Uses.begin()));
+ } else if (Uses.size() > 1) {
+ // Process the instructions in-order
+ LLVM_DEBUG(dbgs() << "Found " << Uses.size() << " Non-PHI uses in "
+ << printMBBReference(*MBB) << '\n');
+ for (MachineInstr &MI : *MBB) {
+ if (llvm::is_contained(Uses, &MI))
+ LV->HandleVirtRegUse(Reg, MBB, MI);
+ }
+ }
+
+ // Mark Reg alive through the block if this is a PHI incoming block
+ if (PHIIncoming.contains(MBB))
+ LV->MarkVirtRegAliveInBlock(OldVarInfo, MRI->getVRegDef(Reg)->getParent(),
+ MBB);
+ }
+
+ // Set the isKilled flag if we get new Kills in the THEN region.
+ for (auto *MI : OldVarInfo.Kills) {
+ if (Visited.contains(MI->getParent()))
+ MI->addRegisterKilled(Reg, TRI);
+ }
+}
+
+void SIOptimizeVGPRLiveRange::updateLiveRangeInElseRegion(
+ Register Reg, Register NewReg, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const {
+ LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg);
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+
+ // Transfer AliveBlocks from Reg to NewReg.
+ for (auto *MBB : ElseBlocks) {
+ unsigned BBNum = MBB->getNumber();
+ if (OldVarInfo.AliveBlocks.test(BBNum)) {
+ NewVarInfo.AliveBlocks.set(BBNum);
+ LLVM_DEBUG(dbgs() << "Removing AliveBlock " << printMBBReference(*MBB)
+ << '\n');
+ OldVarInfo.AliveBlocks.reset(BBNum);
+ }
+ }
+
+ // Transfer the possible Kills in ElseBlocks from Reg to NewReg
+ auto I = OldVarInfo.Kills.begin();
+ while (I != OldVarInfo.Kills.end()) {
+ if (ElseBlocks.contains((*I)->getParent())) {
+ NewVarInfo.Kills.push_back(*I);
+ I = OldVarInfo.Kills.erase(I);
+ } else {
+ ++I;
+ }
+ }
+}
+
+void SIOptimizeVGPRLiveRange::optimizeLiveRange(
+ Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow,
+ MachineBasicBlock *Endif,
+ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const {
+ // Insert a new PHI, marking the value from the THEN region as undef.
+ LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n');
+ const auto *RC = MRI->getRegClass(Reg);
+ Register NewReg = MRI->createVirtualRegister(RC);
+ Register UndefReg = MRI->createVirtualRegister(RC);
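+ // UndefReg is never defined; it only carries the undef incoming value on
+ // the THEN side of the new phi.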
+ MachineInstrBuilder PHI = BuildMI(*Flow, Flow->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ for (auto *Pred : Flow->predecessors()) {
+ if (Pred == If)
+ PHI.addReg(Reg).addMBB(Pred);
+ else
+ PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred);
+ }
+
+ // Replace all uses in the ELSE region or in the PHIs in the ENDIF block.
+ for (auto I = MRI->use_begin(Reg), E = MRI->use_end(); I != E;) {
+ MachineOperand &O = *I;
+ // This is a little bit tricky: setReg() will update the use list, so we
+ // have to increment the iterator before calling setReg() to avoid
+ // skipping some uses.
+ ++I;
+ auto *UseMI = O.getParent();
+ auto *UseBlock = UseMI->getParent();
+ // Replace uses in Endif block
+ if (UseBlock == Endif) {
+ assert(UseMI->isPHI() && "Uses should be PHI in Endif block");
+ O.setReg(NewReg);
+ continue;
+ }
+
+ // Replace uses in Else region
+ if (ElseBlocks.contains(UseBlock))
+ O.setReg(NewReg);
+ }
+
+ // The optimized Reg is not alive through the Flow block anymore.
+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+ OldVarInfo.AliveBlocks.reset(Flow->getNumber());
+
+ updateLiveRangeInElseRegion(Reg, NewReg, Flow, Endif, ElseBlocks);
+ updateLiveRangeInThenRegion(Reg, If, Flow);
+}
+
+char SIOptimizeVGPRLiveRange::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
+ "SI Optimize VGPR LiveRange", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
+ "SI Optimize VGPR LiveRange", false, false)
+
+char &llvm::SIOptimizeVGPRLiveRangeID = SIOptimizeVGPRLiveRange::ID;
+
+FunctionPass *llvm::createSIOptimizeVGPRLiveRangePass() {
+ return new SIOptimizeVGPRLiveRange();
+}
+
+bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ Loops = &getAnalysis<MachineLoopInfo>();
+ LV = &getAnalysis<LiveVariables>();
+ MRI = &MF.getRegInfo();
+
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ bool MadeChange = false;
+
+ // TODO: we need to think about the order of visiting the blocks to get
+ // optimal results for nested if-else cases.
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto &MI : MBB.terminators()) {
+ // Detect the if-else blocks
+ if (MI.getOpcode() == AMDGPU::SI_IF) {
+ MachineBasicBlock *IfTarget = MI.getOperand(2).getMBB();
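+ // The branch target of SI_IF is the Flow block; this is an if-else
+ // only if that block ends with a SI_ELSE branching to the Endif.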
+ auto *Endif = getElseTarget(IfTarget);
+ if (!Endif)
+ continue;
+
+ SmallSetVector<MachineBasicBlock *, 16> ElseBlocks;
+ SmallVector<Register> CandidateRegs;
+
+ LLVM_DEBUG(dbgs() << "Checking IF-ELSE-ENDIF: "
+ << printMBBReference(MBB) << ' '
+ << printMBBReference(*IfTarget) << ' '
+ << printMBBReference(*Endif) << '\n');
+
+ // Collect all the blocks in the ELSE region
+ collectElseRegionBlocks(IfTarget, Endif, ElseBlocks);
+
+ // Collect the registers that can be optimized.
+ collectCandidateRegisters(&MBB, IfTarget, Endif, ElseBlocks,
+ CandidateRegs);
+ MadeChange |= !CandidateRegs.empty();
+ // Now we are safe to optimize.
+ for (auto Reg : CandidateRegs)
+ optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks);
+ }
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index db0329c5a050f..a899655cfd96e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -17,147 +17,149 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: s_cbranch_execz BB0_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v2, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
-; CHECK-NEXT: v_xor_b32_e32 v5, v5, v4
+; CHECK-NEXT: v_xor_b32_e32 v2, v2, v4
; CHECK-NEXT: v_xor_b32_e32 v3, v3, v4
-; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v5
-; CHECK-NEXT: v_cvt_f32_u32_e32 v7, v3
-; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v1
-; CHECK-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v8
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
-; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v5
-; CHECK-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CHECK-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6
-; CHECK-NEXT: v_trunc_f32_e32 v9, v9
-; CHECK-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9
+; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2
+; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v3
+; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v7
+; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
+; CHECK-NEXT: v_trunc_f32_e32 v6, v6
+; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5
; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v9
-; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
-; CHECK-NEXT: v_xor_b32_e32 v7, v7, v8
-; CHECK-NEXT: v_mul_lo_u32 v12, v11, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v10, v9
-; CHECK-NEXT: v_mul_hi_u32 v15, v10, v6
-; CHECK-NEXT: v_mul_lo_u32 v14, v10, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7
+; CHECK-NEXT: v_mul_lo_u32 v10, v9, v5
+; CHECK-NEXT: v_mul_lo_u32 v11, v8, v6
+; CHECK-NEXT: v_mul_hi_u32 v13, v8, v5
+; CHECK-NEXT: v_mul_lo_u32 v12, v8, v5
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CHECK-NEXT: v_mul_lo_u32 v11, v6, v12
+; CHECK-NEXT: v_mul_lo_u32 v13, v5, v10
+; CHECK-NEXT: v_mul_hi_u32 v14, v5, v12
+; CHECK-NEXT: v_mul_hi_u32 v12, v6, v12
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CHECK-NEXT: v_mul_hi_u32 v13, v5, v10
+; CHECK-NEXT: v_mul_hi_u32 v10, v6, v10
+; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; CHECK-NEXT: v_mul_lo_u32 v13, v9, v14
-; CHECK-NEXT: v_mul_lo_u32 v15, v6, v12
-; CHECK-NEXT: v_mul_hi_u32 v16, v6, v14
-; CHECK-NEXT: v_mul_hi_u32 v14, v9, v14
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v16, v9, v12
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CHECK-NEXT: v_mul_hi_u32 v15, v6, v12
-; CHECK-NEXT: v_mul_hi_u32 v12, v9, v12
-; CHECK-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v15, vcc, v16, v15
; CHECK-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CHECK-NEXT: v_addc_u32_e64 v13, s[4:5], v9, v12, vcc
-; CHECK-NEXT: v_mul_lo_u32 v11, v11, v6
-; CHECK-NEXT: v_mul_lo_u32 v14, v10, v13
-; CHECK-NEXT: v_mul_lo_u32 v15, v10, v6
-; CHECK-NEXT: v_mul_hi_u32 v10, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; CHECK-NEXT: v_addc_u32_e64 v11, s[4:5], v6, v10, vcc
+; CHECK-NEXT: v_mul_lo_u32 v9, v9, v5
+; CHECK-NEXT: v_mul_lo_u32 v12, v8, v11
+; CHECK-NEXT: v_mul_lo_u32 v13, v8, v5
+; CHECK-NEXT: v_mul_hi_u32 v8, v8, v5
+; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12
+; CHECK-NEXT: v_mul_hi_u32 v10, v5, v13
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8
+; CHECK-NEXT: v_mul_lo_u32 v9, v11, v13
+; CHECK-NEXT: v_mul_lo_u32 v12, v5, v8
+; CHECK-NEXT: v_mul_hi_u32 v13, v11, v13
; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT: v_mul_hi_u32 v12, v6, v15
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
-; CHECK-NEXT: v_mul_lo_u32 v11, v13, v15
-; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10
-; CHECK-NEXT: v_mul_hi_u32 v15, v13, v15
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT: v_mul_lo_u32 v12, v13, v10
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11
-; CHECK-NEXT: v_mul_hi_u32 v14, v6, v10
-; CHECK-NEXT: v_mul_hi_u32 v10, v13, v10
-; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
-; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CHECK-NEXT: v_mul_lo_u32 v10, v11, v8
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9
+; CHECK-NEXT: v_mul_hi_u32 v12, v5, v8
+; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8
+; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
+; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12
-; CHECK-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10
+; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; CHECK-NEXT: v_mul_lo_u32 v8, v1, v5
+; CHECK-NEXT: v_mul_lo_u32 v9, v0, v6
+; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6
-; CHECK-NEXT: v_mul_lo_u32 v11, v7, v9
-; CHECK-NEXT: v_mul_hi_u32 v12, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_mul_hi_u32 v9, v0, v6
; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v12, v1, v9
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT: v_mul_hi_u32 v11, v7, v9
-; CHECK-NEXT: v_mul_hi_u32 v9, v1, v9
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT: v_mul_lo_u32 v10, v3, v6
-; CHECK-NEXT: v_mul_lo_u32 v11, v5, v9
-; CHECK-NEXT: v_mul_hi_u32 v13, v5, v6
-; CHECK-NEXT: v_mul_lo_u32 v12, v5, v6
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT: v_subb_u32_e64 v11, s[4:5], v1, v10, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v10
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v3
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5
+; CHECK-NEXT: v_mul_lo_u32 v9, v2, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v2, v5
+; CHECK-NEXT: v_mul_lo_u32 v10, v2, v5
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
+; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5
-; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v3
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, 1, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5]
-; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v11
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v9
+; CHECK-NEXT: v_addc_u32_e32 v2, vcc, 0, v10, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT: v_xor_b32_e32 v2, v7, v4
; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v5, v8, v4
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v5
-; CHECK-NEXT: v_xor_b32_e32 v3, v3, v5
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
-; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v1, v2, vcc
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: BB0_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7]
@@ -702,146 +704,148 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: ; %bb.1:
; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v5, v0, vcc
; CGP-NEXT: v_xor_b32_e32 v1, v1, v0
-; CGP-NEXT: v_xor_b32_e32 v5, v5, v0
-; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1
-; CGP-NEXT: v_cvt_f32_u32_e32 v11, v5
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v9
-; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
-; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v8, v12
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v1
-; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
-; CGP-NEXT: v_mul_f32_e32 v13, 0x2f800000, v10
-; CGP-NEXT: v_trunc_f32_e32 v13, v13
-; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v13
+; CGP-NEXT: v_xor_b32_e32 v4, v4, v0
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v1
+; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4
+; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v10
+; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
+; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v8, v8, v11
+; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v5
+; CGP-NEXT: v_trunc_f32_e32 v10, v10
+; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v10
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v13
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v11, v12
-; CGP-NEXT: v_mul_lo_u32 v16, v15, v10
-; CGP-NEXT: v_mul_lo_u32 v17, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v19, v14, v10
-; CGP-NEXT: v_mul_lo_u32 v18, v14, v10
-; CGP-NEXT: v_xor_b32_e32 v9, v9, v12
+; CGP-NEXT: v_xor_b32_e32 v9, v9, v11
+; CGP-NEXT: v_mul_lo_u32 v14, v13, v5
+; CGP-NEXT: v_mul_lo_u32 v15, v12, v10
+; CGP-NEXT: v_mul_hi_u32 v17, v12, v5
+; CGP-NEXT: v_mul_lo_u32 v16, v12, v5
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
+; CGP-NEXT: v_mul_lo_u32 v15, v10, v16
+; CGP-NEXT: v_mul_lo_u32 v17, v5, v14
+; CGP-NEXT: v_mul_hi_u32 v18, v5, v16
+; CGP-NEXT: v_mul_hi_u32 v16, v10, v16
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v18, v10, v14
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT: v_mul_hi_u32 v17, v5, v14
+; CGP-NEXT: v_mul_hi_u32 v14, v10, v14
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16
+; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19
-; CGP-NEXT: v_mul_lo_u32 v17, v13, v18
-; CGP-NEXT: v_mul_lo_u32 v19, v10, v16
-; CGP-NEXT: v_mul_hi_u32 v20, v10, v18
-; CGP-NEXT: v_mul_hi_u32 v18, v13, v18
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v19
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v20
; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v20, v13, v16
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17
-; CGP-NEXT: v_mul_hi_u32 v19, v10, v16
-; CGP-NEXT: v_mul_hi_u32 v16, v13, v16
-; CGP-NEXT: v_add_i32_e32 v18, vcc, v20, v18
-; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v19
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17
-; CGP-NEXT: v_addc_u32_e64 v17, s[4:5], v13, v16, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v15, v10
-; CGP-NEXT: v_mul_lo_u32 v18, v14, v17
-; CGP-NEXT: v_mul_lo_u32 v19, v14, v10
-; CGP-NEXT: v_mul_hi_u32 v14, v14, v10
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15
+; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v10, v14, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v13, v5
+; CGP-NEXT: v_mul_lo_u32 v16, v12, v15
+; CGP-NEXT: v_mul_lo_u32 v17, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v12, v12, v5
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
+; CGP-NEXT: v_mul_hi_u32 v14, v5, v17
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v15, v17
+; CGP-NEXT: v_mul_lo_u32 v16, v5, v12
+; CGP-NEXT: v_mul_hi_u32 v17, v15, v17
; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT: v_mul_hi_u32 v16, v10, v19
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14
-; CGP-NEXT: v_mul_lo_u32 v15, v17, v19
-; CGP-NEXT: v_mul_lo_u32 v18, v10, v14
-; CGP-NEXT: v_mul_hi_u32 v19, v17, v19
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT: v_mul_lo_u32 v16, v17, v14
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v18, v15
-; CGP-NEXT: v_mul_hi_u32 v18, v10, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v17, v14
-; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v19
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v18, s[4:5], v19, v18
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15
; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v18, v16
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v12
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13
+; CGP-NEXT: v_mul_hi_u32 v16, v5, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v15, v12
+; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
+; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v14, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14
+; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT: v_mul_lo_u32 v12, v9, v5
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v10
+; CGP-NEXT: v_mul_hi_u32 v14, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v9, v5
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_mul_lo_u32 v14, v9, v10
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v13
-; CGP-NEXT: v_mul_hi_u32 v16, v11, v10
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v10
; CGP-NEXT: v_mul_hi_u32 v10, v9, v10
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v9, v13
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_mul_hi_u32 v15, v11, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v9, v13
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v16, v10
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT: v_mul_lo_u32 v14, v5, v10
-; CGP-NEXT: v_mul_lo_u32 v15, v1, v13
-; CGP-NEXT: v_mul_hi_u32 v17, v1, v10
-; CGP-NEXT: v_mul_lo_u32 v16, v1, v10
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v16
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v9, v14, vcc
-; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v14
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v5
-; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v5, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v1
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT: v_mul_lo_u32 v12, v4, v5
+; CGP-NEXT: v_mul_lo_u32 v13, v1, v10
+; CGP-NEXT: v_mul_hi_u32 v15, v1, v5
+; CGP-NEXT: v_mul_lo_u32 v14, v1, v5
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v14
+; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v9, v12, vcc
+; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v12
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v4
+; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v1
; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v5
-; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v10
-; CGP-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[4:5]
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v4
+; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v5
+; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5]
+; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v10, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v15
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v16, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v13
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v14, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v5, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v9, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v12, v0
-; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v1, v9
-; CGP-NEXT: v_xor_b32_e32 v1, v5, v9
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v8, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v11, v0
+; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v1, v5
+; CGP-NEXT: v_xor_b32_e32 v1, v4, v5
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; CGP-NEXT: ; implicit-def: $vgpr8
+; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: BB2_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@@ -879,146 +883,148 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: ; %bb.5:
; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v7
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v4
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v4, vcc
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v5, v5, v4
-; CGP-NEXT: v_xor_b32_e32 v7, v7, v4
-; CGP-NEXT: v_cvt_f32_u32_e32 v8, v5
-; CGP-NEXT: v_cvt_f32_u32_e32 v9, v7
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9
-; CGP-NEXT: v_rcp_iflag_f32_e32 v8, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v5
-; CGP-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8
-; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v8
-; CGP-NEXT: v_trunc_f32_e32 v11, v11
-; CGP-NEXT: v_mac_f32_e32 v8, 0xcf800000, v11
+; CGP-NEXT: v_xor_b32_e32 v6, v6, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v7, v5
+; CGP-NEXT: v_cvt_f32_u32_e32 v8, v6
+; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8
+; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7
+; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v5
+; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v6, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v9
+; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
+; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7
+; CGP-NEXT: v_trunc_f32_e32 v8, v8
+; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8
+; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT: v_cvt_u32_f32_e32 v11, v11
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v7, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v9, v10
-; CGP-NEXT: v_mul_lo_u32 v14, v13, v8
-; CGP-NEXT: v_mul_lo_u32 v15, v12, v11
-; CGP-NEXT: v_mul_hi_u32 v17, v12, v8
-; CGP-NEXT: v_mul_lo_u32 v16, v12, v8
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v9
+; CGP-NEXT: v_mul_lo_u32 v12, v11, v7
+; CGP-NEXT: v_mul_lo_u32 v13, v10, v8
+; CGP-NEXT: v_mul_hi_u32 v15, v10, v7
+; CGP-NEXT: v_mul_lo_u32 v14, v10, v7
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v14
+; CGP-NEXT: v_mul_lo_u32 v15, v7, v12
+; CGP-NEXT: v_mul_hi_u32 v16, v7, v14
+; CGP-NEXT: v_mul_hi_u32 v14, v8, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v16, v8, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT: v_mul_hi_u32 v15, v7, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v8, v12
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v16
-; CGP-NEXT: v_mul_lo_u32 v17, v8, v14
-; CGP-NEXT: v_mul_hi_u32 v18, v8, v16
-; CGP-NEXT: v_mul_hi_u32 v16, v11, v16
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v18, v11, v14
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT: v_mul_hi_u32 v17, v8, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v11, v14
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17
; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15
-; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v11, v14, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v13, v8
-; CGP-NEXT: v_mul_lo_u32 v16, v12, v15
-; CGP-NEXT: v_mul_lo_u32 v17, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v12, v12, v8
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13
+; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v8, v12, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v11, v7
+; CGP-NEXT: v_mul_lo_u32 v14, v10, v13
+; CGP-NEXT: v_mul_lo_u32 v15, v10, v7
+; CGP-NEXT: v_mul_hi_u32 v10, v10, v7
+; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
+; CGP-NEXT: v_mul_hi_u32 v12, v7, v15
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v13, v15
+; CGP-NEXT: v_mul_lo_u32 v14, v7, v10
+; CGP-NEXT: v_mul_hi_u32 v15, v13, v15
; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT: v_mul_hi_u32 v14, v8, v17
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT: v_mul_lo_u32 v13, v15, v17
-; CGP-NEXT: v_mul_lo_u32 v16, v8, v12
-; CGP-NEXT: v_mul_hi_u32 v17, v15, v17
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT: v_mul_lo_u32 v14, v15, v12
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13
-; CGP-NEXT: v_mul_hi_u32 v16, v8, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v15, v12
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v12, v13, v10
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v13, v10
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, v11, v12, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v3, v7
+; CGP-NEXT: v_mul_lo_u32 v11, v2, v8
+; CGP-NEXT: v_mul_hi_u32 v12, v2, v7
+; CGP-NEXT: v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_mul_lo_u32 v12, v3, v8
-; CGP-NEXT: v_mul_lo_u32 v13, v9, v11
-; CGP-NEXT: v_mul_hi_u32 v14, v9, v8
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_hi_u32 v11, v2, v8
; CGP-NEXT: v_mul_hi_u32 v8, v3, v8
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v3, v11
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_mul_hi_u32 v13, v9, v11
-; CGP-NEXT: v_mul_hi_u32 v11, v3, v11
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v14, v8
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_mul_lo_u32 v12, v7, v8
-; CGP-NEXT: v_mul_lo_u32 v13, v5, v11
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v8
-; CGP-NEXT: v_mul_lo_u32 v14, v5, v8
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v14
-; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v3, v12, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v12
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v7
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT: v_mul_lo_u32 v10, v6, v7
+; CGP-NEXT: v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT: v_mul_hi_u32 v13, v5, v7
+; CGP-NEXT: v_mul_lo_u32 v12, v5, v7
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v3, v10, vcc
+; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v10
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v6
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v7
-; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v8
-; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5]
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v11, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; CGP-NEXT: v_cndmask_b32_e32 v3, v15, v5, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v13
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v7, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v6
+; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v7
+; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5]
+; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; CGP-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v11
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v9, v4
; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v7, v10, v4
-; CGP-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v7
-; CGP-NEXT: v_xor_b32_e32 v5, v5, v7
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v3, v7
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v5
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v5
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v5
+; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc
+; CGP-NEXT: ; implicit-def: $vgpr2
+; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: BB2_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@@ -2516,146 +2522,148 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v5
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v2
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v2, vcc
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v3, v3, v2
-; CHECK-NEXT: v_xor_b32_e32 v5, v5, v2
-; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CHECK-NEXT: v_cvt_f32_u32_e32 v7, v5
-; CHECK-NEXT: v_ashrrev_i32_e32 v8, 31, v1
-; CHECK-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v8
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
-; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v3
-; CHECK-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CHECK-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6
-; CHECK-NEXT: v_trunc_f32_e32 v9, v9
-; CHECK-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9
+; CHECK-NEXT: v_xor_b32_e32 v4, v4, v2
+; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v4
+; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
+; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v4, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v7
+; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
+; CHECK-NEXT: v_trunc_f32_e32 v6, v6
+; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5
; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v9
-; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
-; CHECK-NEXT: v_xor_b32_e32 v7, v7, v8
-; CHECK-NEXT: v_mul_lo_u32 v12, v11, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v10, v9
-; CHECK-NEXT: v_mul_hi_u32 v15, v10, v6
-; CHECK-NEXT: v_mul_lo_u32 v14, v10, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v8
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7
+; CHECK-NEXT: v_mul_lo_u32 v10, v9, v5
+; CHECK-NEXT: v_mul_lo_u32 v11, v8, v6
+; CHECK-NEXT: v_mul_hi_u32 v13, v8, v5
+; CHECK-NEXT: v_mul_lo_u32 v12, v8, v5
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CHECK-NEXT: v_mul_lo_u32 v11, v6, v12
+; CHECK-NEXT: v_mul_lo_u32 v13, v5, v10
+; CHECK-NEXT: v_mul_hi_u32 v14, v5, v12
+; CHECK-NEXT: v_mul_hi_u32 v12, v6, v12
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CHECK-NEXT: v_mul_hi_u32 v13, v5, v10
+; CHECK-NEXT: v_mul_hi_u32 v10, v6, v10
+; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; CHECK-NEXT: v_mul_lo_u32 v13, v9, v14
-; CHECK-NEXT: v_mul_lo_u32 v15, v6, v12
-; CHECK-NEXT: v_mul_hi_u32 v16, v6, v14
-; CHECK-NEXT: v_mul_hi_u32 v14, v9, v14
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v16, v9, v12
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CHECK-NEXT: v_mul_hi_u32 v15, v6, v12
-; CHECK-NEXT: v_mul_hi_u32 v12, v9, v12
-; CHECK-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v15, vcc, v16, v15
; CHECK-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CHECK-NEXT: v_addc_u32_e64 v13, s[4:5], v9, v12, vcc
-; CHECK-NEXT: v_mul_lo_u32 v11, v11, v6
-; CHECK-NEXT: v_mul_lo_u32 v14, v10, v13
-; CHECK-NEXT: v_mul_lo_u32 v15, v10, v6
-; CHECK-NEXT: v_mul_hi_u32 v10, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; CHECK-NEXT: v_addc_u32_e64 v11, s[4:5], v6, v10, vcc
+; CHECK-NEXT: v_mul_lo_u32 v9, v9, v5
+; CHECK-NEXT: v_mul_lo_u32 v12, v8, v11
+; CHECK-NEXT: v_mul_lo_u32 v13, v8, v5
+; CHECK-NEXT: v_mul_hi_u32 v8, v8, v5
+; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12
+; CHECK-NEXT: v_mul_hi_u32 v10, v5, v13
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8
+; CHECK-NEXT: v_mul_lo_u32 v9, v11, v13
+; CHECK-NEXT: v_mul_lo_u32 v12, v5, v8
+; CHECK-NEXT: v_mul_hi_u32 v13, v11, v13
; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT: v_mul_hi_u32 v12, v6, v15
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
-; CHECK-NEXT: v_mul_lo_u32 v11, v13, v15
-; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10
-; CHECK-NEXT: v_mul_hi_u32 v15, v13, v15
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT: v_mul_lo_u32 v12, v13, v10
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11
-; CHECK-NEXT: v_mul_hi_u32 v14, v6, v10
-; CHECK-NEXT: v_mul_hi_u32 v10, v13, v10
-; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
-; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CHECK-NEXT: v_mul_lo_u32 v10, v11, v8
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9
+; CHECK-NEXT: v_mul_hi_u32 v12, v5, v8
+; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8
+; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
+; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12
-; CHECK-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CHECK-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10
+; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; CHECK-NEXT: v_mul_lo_u32 v8, v1, v5
+; CHECK-NEXT: v_mul_lo_u32 v9, v0, v6
+; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6
-; CHECK-NEXT: v_mul_lo_u32 v11, v7, v9
-; CHECK-NEXT: v_mul_hi_u32 v12, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_mul_hi_u32 v9, v0, v6
; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v12, v1, v9
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT: v_mul_hi_u32 v11, v7, v9
-; CHECK-NEXT: v_mul_hi_u32 v9, v1, v9
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT: v_mul_lo_u32 v10, v5, v6
-; CHECK-NEXT: v_mul_lo_u32 v11, v3, v9
-; CHECK-NEXT: v_mul_hi_u32 v13, v3, v6
-; CHECK-NEXT: v_mul_lo_u32 v12, v3, v6
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT: v_subb_u32_e64 v11, s[4:5], v1, v10, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v10
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT: v_mul_lo_u32 v8, v4, v5
+; CHECK-NEXT: v_mul_lo_u32 v9, v3, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v3, v5
+; CHECK-NEXT: v_mul_lo_u32 v10, v3, v5
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
+; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v5
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, 1, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5]
-; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v13, v3, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v11
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v4
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v9
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT: v_xor_b32_e32 v3, v7, v2
; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v5, v8, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v5
-; CHECK-NEXT: v_xor_b32_e32 v3, v3, v5
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v1, v5
-; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v3
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: BB7_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7]
@@ -3008,142 +3016,144 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v1, v0
; CGP-NEXT: v_xor_b32_e32 v4, v4, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1
-; CGP-NEXT: v_cvt_f32_u32_e32 v11, v4
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7
-; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v11
+; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4
+; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc
+; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v10
; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v12
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v1
+; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
+; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v5, v11
; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CGP-NEXT: v_mul_f32_e32 v13, 0x2f800000, v6
-; CGP-NEXT: v_trunc_f32_e32 v13, v13
-; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v13
+; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v6
+; CGP-NEXT: v_trunc_f32_e32 v10, v10
+; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v13
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v11, v12
-; CGP-NEXT: v_mul_lo_u32 v16, v15, v6
-; CGP-NEXT: v_mul_lo_u32 v17, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v19, v14, v6
-; CGP-NEXT: v_mul_lo_u32 v18, v14, v6
-; CGP-NEXT: v_xor_b32_e32 v7, v7, v12
+; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10
+; CGP-NEXT: v_xor_b32_e32 v7, v7, v11
+; CGP-NEXT: v_mul_lo_u32 v14, v13, v6
+; CGP-NEXT: v_mul_lo_u32 v15, v12, v10
+; CGP-NEXT: v_mul_hi_u32 v17, v12, v6
+; CGP-NEXT: v_mul_lo_u32 v16, v12, v6
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
+; CGP-NEXT: v_mul_lo_u32 v15, v10, v16
+; CGP-NEXT: v_mul_lo_u32 v17, v6, v14
+; CGP-NEXT: v_mul_hi_u32 v18, v6, v16
+; CGP-NEXT: v_mul_hi_u32 v16, v10, v16
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v18, v10, v14
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT: v_mul_hi_u32 v17, v6, v14
+; CGP-NEXT: v_mul_hi_u32 v14, v10, v14
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16
+; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19
-; CGP-NEXT: v_mul_lo_u32 v17, v13, v18
-; CGP-NEXT: v_mul_lo_u32 v19, v6, v16
-; CGP-NEXT: v_mul_hi_u32 v20, v6, v18
-; CGP-NEXT: v_mul_hi_u32 v18, v13, v18
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v19
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v20
; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v20, v13, v16
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17
-; CGP-NEXT: v_mul_hi_u32 v19, v6, v16
-; CGP-NEXT: v_mul_hi_u32 v16, v13, v16
-; CGP-NEXT: v_add_i32_e32 v18, vcc, v20, v18
-; CGP-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v19
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v17
-; CGP-NEXT: v_addc_u32_e64 v17, s[4:5], v13, v16, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v15, v6
-; CGP-NEXT: v_mul_lo_u32 v18, v14, v17
-; CGP-NEXT: v_mul_lo_u32 v19, v14, v6
-; CGP-NEXT: v_mul_hi_u32 v14, v14, v6
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15
+; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v10, v14, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v13, v6
+; CGP-NEXT: v_mul_lo_u32 v16, v12, v15
+; CGP-NEXT: v_mul_lo_u32 v17, v12, v6
+; CGP-NEXT: v_mul_hi_u32 v12, v12, v6
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
+; CGP-NEXT: v_mul_hi_u32 v14, v6, v17
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v15, v17
+; CGP-NEXT: v_mul_lo_u32 v16, v6, v12
+; CGP-NEXT: v_mul_hi_u32 v17, v15, v17
; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT: v_mul_hi_u32 v16, v6, v19
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14
-; CGP-NEXT: v_mul_lo_u32 v15, v17, v19
-; CGP-NEXT: v_mul_lo_u32 v18, v6, v14
-; CGP-NEXT: v_mul_hi_u32 v19, v17, v19
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v16
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT: v_mul_lo_u32 v16, v17, v14
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v18, v15
-; CGP-NEXT: v_mul_hi_u32 v18, v6, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v17, v14
-; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v19
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v16, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v18, s[4:5], v19, v18
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15
; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v18, v16
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v12
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13
+; CGP-NEXT: v_mul_hi_u32 v16, v6, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v15, v12
+; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
+; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v14, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v7, v6
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v13
-; CGP-NEXT: v_mul_hi_u32 v16, v11, v6
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14
+; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
+; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT: v_mul_lo_u32 v12, v7, v6
+; CGP-NEXT: v_mul_lo_u32 v13, v5, v10
+; CGP-NEXT: v_mul_hi_u32 v14, v5, v6
; CGP-NEXT: v_mul_hi_u32 v6, v7, v6
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v7, v13
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_mul_hi_u32 v15, v11, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v13
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v16, v6
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v14, v7, v10
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v13, v5, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v7, v10
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT: v_mul_lo_u32 v14, v4, v6
-; CGP-NEXT: v_mul_lo_u32 v15, v1, v13
-; CGP-NEXT: v_mul_hi_u32 v17, v1, v6
-; CGP-NEXT: v_mul_lo_u32 v16, v1, v6
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v16
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v7, v14, vcc
-; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v14
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT: v_mul_lo_u32 v12, v4, v6
+; CGP-NEXT: v_mul_lo_u32 v13, v1, v10
+; CGP-NEXT: v_mul_hi_u32 v15, v1, v6
+; CGP-NEXT: v_mul_lo_u32 v14, v1, v6
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v14
+; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v7, v12, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v12
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v4
; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v1
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v1
; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v15, v4
-; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v6
-; CGP-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[4:5]
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v4
+; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v6
+; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5]
+; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v10, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
-; CGP-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v15
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v16, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v13
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v14, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v7, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v5, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v6, v12, v0
-; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v1, v6
-; CGP-NEXT: v_xor_b32_e32 v1, v4, v6
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v11, v0
+; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v1, v5
+; CGP-NEXT: v_xor_b32_e32 v1, v4, v5
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; CGP-NEXT: ; implicit-def: $vgpr5
+; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11
; CGP-NEXT: BB8_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@@ -3185,142 +3195,144 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v5, v5, v4
; CGP-NEXT: v_xor_b32_e32 v6, v6, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v7, v5
-; CGP-NEXT: v_cvt_f32_u32_e32 v9, v6
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v9
+; CGP-NEXT: v_cvt_f32_u32_e32 v8, v6
+; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8
; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v5
+; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v5
+; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v6, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v9
; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7
-; CGP-NEXT: v_trunc_f32_e32 v11, v11
-; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11
+; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7
+; CGP-NEXT: v_trunc_f32_e32 v8, v8
+; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8
; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v11, v11
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v6, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v9, v10
-; CGP-NEXT: v_mul_lo_u32 v14, v13, v7
-; CGP-NEXT: v_mul_lo_u32 v15, v12, v11
-; CGP-NEXT: v_mul_hi_u32 v17, v12, v7
-; CGP-NEXT: v_mul_lo_u32 v16, v12, v7
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v9
+; CGP-NEXT: v_mul_lo_u32 v12, v11, v7
+; CGP-NEXT: v_mul_lo_u32 v13, v10, v8
+; CGP-NEXT: v_mul_hi_u32 v15, v10, v7
+; CGP-NEXT: v_mul_lo_u32 v14, v10, v7
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v14
+; CGP-NEXT: v_mul_lo_u32 v15, v7, v12
+; CGP-NEXT: v_mul_hi_u32 v16, v7, v14
+; CGP-NEXT: v_mul_hi_u32 v14, v8, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v16, v8, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT: v_mul_hi_u32 v15, v7, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v8, v12
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v16
-; CGP-NEXT: v_mul_lo_u32 v17, v7, v14
-; CGP-NEXT: v_mul_hi_u32 v18, v7, v16
-; CGP-NEXT: v_mul_hi_u32 v16, v11, v16
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v18, v11, v14
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT: v_mul_hi_u32 v17, v7, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v11, v14
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17
; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15
-; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v11, v14, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v13, v7
-; CGP-NEXT: v_mul_lo_u32 v16, v12, v15
-; CGP-NEXT: v_mul_lo_u32 v17, v12, v7
-; CGP-NEXT: v_mul_hi_u32 v12, v12, v7
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13
+; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v8, v12, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v11, v7
+; CGP-NEXT: v_mul_lo_u32 v14, v10, v13
+; CGP-NEXT: v_mul_lo_u32 v15, v10, v7
+; CGP-NEXT: v_mul_hi_u32 v10, v10, v7
+; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
+; CGP-NEXT: v_mul_hi_u32 v12, v7, v15
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v13, v15
+; CGP-NEXT: v_mul_lo_u32 v14, v7, v10
+; CGP-NEXT: v_mul_hi_u32 v15, v13, v15
; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v17
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT: v_mul_lo_u32 v13, v15, v17
-; CGP-NEXT: v_mul_lo_u32 v16, v7, v12
-; CGP-NEXT: v_mul_hi_u32 v17, v15, v17
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT: v_mul_lo_u32 v14, v15, v12
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13
-; CGP-NEXT: v_mul_hi_u32 v16, v7, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v15, v12
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v16, s[4:5], v17, v16
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v12, v13, v10
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v13, v10
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, v11, v12, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v3, v7
-; CGP-NEXT: v_mul_lo_u32 v13, v9, v11
-; CGP-NEXT: v_mul_hi_u32 v14, v9, v7
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v3, v7
+; CGP-NEXT: v_mul_lo_u32 v11, v2, v8
+; CGP-NEXT: v_mul_hi_u32 v12, v2, v7
; CGP-NEXT: v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v3, v11
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_mul_hi_u32 v13, v9, v11
-; CGP-NEXT: v_mul_hi_u32 v11, v3, v11
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v12, v3, v8
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_hi_u32 v11, v2, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v3, v8
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_mul_lo_u32 v12, v6, v7
-; CGP-NEXT: v_mul_lo_u32 v13, v5, v11
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v7
-; CGP-NEXT: v_mul_lo_u32 v14, v5, v7
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v14
-; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v3, v12, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v12
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT: v_mul_lo_u32 v10, v6, v7
+; CGP-NEXT: v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT: v_mul_hi_u32 v13, v5, v7
+; CGP-NEXT: v_mul_lo_u32 v12, v5, v7
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v3, v10, vcc
+; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v10
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v6
; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v5
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v6
-; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v7
-; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5]
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v11, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v6
+; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v7
+; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5]
+; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
-; CGP-NEXT: v_cndmask_b32_e32 v3, v15, v5, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v13
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v14, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v5, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
-; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v6, v10, v4
-; CGP-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v6
-; CGP-NEXT: v_xor_b32_e32 v5, v5, v6
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v3, v6
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v11
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v9, v4
+; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v5
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v5
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v5
+; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc
+; CGP-NEXT: ; implicit-def: $vgpr2
+; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9
; CGP-NEXT: BB8_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 840653d8c4c54..2ae38a64fe34a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -17,145 +17,147 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: s_cbranch_execz BB0_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v2, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; CHECK-NEXT: v_xor_b32_e32 v3, v3, v4
-; CHECK-NEXT: v_xor_b32_e32 v5, v5, v4
-; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v5
-; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
+; CHECK-NEXT: v_xor_b32_e32 v2, v2, v4
+; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2
+; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v7
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v5
+; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
; CHECK-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CHECK-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4
-; CHECK-NEXT: v_trunc_f32_e32 v8, v8
-; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
+; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; CHECK-NEXT: v_trunc_f32_e32 v5, v5
+; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v8
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v3, vcc
-; CHECK-NEXT: v_xor_b32_e32 v6, v6, v7
-; CHECK-NEXT: v_mul_lo_u32 v11, v10, v4
-; CHECK-NEXT: v_mul_lo_u32 v12, v9, v8
-; CHECK-NEXT: v_mul_hi_u32 v14, v9, v4
-; CHECK-NEXT: v_mul_lo_u32 v13, v9, v4
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7
+; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT: v_mul_lo_u32 v9, v8, v4
+; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5
+; CHECK-NEXT: v_mul_hi_u32 v12, v7, v4
+; CHECK-NEXT: v_mul_lo_u32 v11, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11
+; CHECK-NEXT: v_mul_lo_u32 v12, v4, v9
+; CHECK-NEXT: v_mul_hi_u32 v13, v4, v11
+; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; CHECK-NEXT: v_mul_hi_u32 v12, v4, v9
+; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v14
-; CHECK-NEXT: v_mul_lo_u32 v12, v8, v13
-; CHECK-NEXT: v_mul_lo_u32 v14, v4, v11
-; CHECK-NEXT: v_mul_hi_u32 v15, v4, v13
-; CHECK-NEXT: v_mul_hi_u32 v13, v8, v13
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v15
; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v15, v8, v11
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CHECK-NEXT: v_mul_hi_u32 v14, v4, v11
-; CHECK-NEXT: v_mul_hi_u32 v11, v8, v11
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; CHECK-NEXT: v_addc_u32_e64 v12, s[4:5], v8, v11, vcc
-; CHECK-NEXT: v_mul_lo_u32 v10, v10, v4
-; CHECK-NEXT: v_mul_lo_u32 v13, v9, v12
-; CHECK-NEXT: v_mul_lo_u32 v14, v9, v4
-; CHECK-NEXT: v_mul_hi_u32 v9, v9, v4
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
+; CHECK-NEXT: v_mul_lo_u32 v8, v8, v4
+; CHECK-NEXT: v_mul_lo_u32 v11, v7, v10
+; CHECK-NEXT: v_mul_lo_u32 v12, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v7, v4
+; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11
+; CHECK-NEXT: v_mul_hi_u32 v9, v4, v12
+; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
+; CHECK-NEXT: v_mul_lo_u32 v8, v10, v12
+; CHECK-NEXT: v_mul_lo_u32 v11, v4, v7
+; CHECK-NEXT: v_mul_hi_u32 v12, v10, v12
; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
-; CHECK-NEXT: v_mul_hi_u32 v11, v4, v14
-; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9
-; CHECK-NEXT: v_mul_lo_u32 v10, v12, v14
-; CHECK-NEXT: v_mul_lo_u32 v13, v4, v9
-; CHECK-NEXT: v_mul_hi_u32 v14, v12, v14
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT: v_mul_lo_u32 v11, v12, v9
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10
-; CHECK-NEXT: v_mul_hi_u32 v13, v4, v9
-; CHECK-NEXT: v_mul_hi_u32 v9, v12, v9
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; CHECK-NEXT: v_mul_lo_u32 v9, v10, v7
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8
+; CHECK-NEXT: v_mul_hi_u32 v11, v4, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11
-; CHECK-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; CHECK-NEXT: v_mul_lo_u32 v9, v1, v4
-; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8
-; CHECK-NEXT: v_mul_hi_u32 v11, v6, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9
+; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9
+; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4
+; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5
+; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4
; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v11, v1, v8
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CHECK-NEXT: v_mul_hi_u32 v10, v6, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v0, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT: v_mul_lo_u32 v9, v3, v4
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v8
-; CHECK-NEXT: v_mul_lo_u32 v10, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v4, v5, v4
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v6, v10
-; CHECK-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v4, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT: v_mul_lo_u32 v7, v3, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v2, v5
+; CHECK-NEXT: v_mul_lo_u32 v8, v2, v4
+; CHECK-NEXT: v_mul_hi_u32 v4, v2, v4
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3
; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, v6, v5
-; CHECK-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v2
+; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2
; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v3
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v9, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v3, v3, v7
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v3, v7
-; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v1, v7, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v1, v6, vcc
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: BB0_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -690,144 +692,146 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: ; %bb.1:
; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v5, v0, vcc
; CGP-NEXT: v_xor_b32_e32 v1, v1, v0
-; CGP-NEXT: v_xor_b32_e32 v0, v5, v0
-; CGP-NEXT: v_cvt_f32_u32_e32 v5, v1
-; CGP-NEXT: v_cvt_f32_u32_e32 v10, v0
-; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9
-; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v10
-; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v8, v11
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
-; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v1
-; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT: v_mul_f32_e32 v12, 0x2f800000, v5
-; CGP-NEXT: v_trunc_f32_e32 v12, v12
-; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v12
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v12
-; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v0, vcc
-; CGP-NEXT: v_xor_b32_e32 v10, v10, v11
-; CGP-NEXT: v_mul_lo_u32 v15, v14, v5
-; CGP-NEXT: v_mul_lo_u32 v16, v13, v12
-; CGP-NEXT: v_mul_hi_u32 v18, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v17, v13, v5
-; CGP-NEXT: v_xor_b32_e32 v9, v9, v11
+; CGP-NEXT: v_xor_b32_e32 v0, v4, v0
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v0
+; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9
+; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v10
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc
+; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4
+; CGP-NEXT: v_trunc_f32_e32 v9, v9
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9
+; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9
+; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v5, v10
+; CGP-NEXT: v_mul_lo_u32 v13, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v14, v11, v9
+; CGP-NEXT: v_mul_hi_u32 v16, v11, v4
+; CGP-NEXT: v_mul_lo_u32 v15, v11, v4
+; CGP-NEXT: v_xor_b32_e32 v8, v8, v10
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_mul_lo_u32 v14, v9, v15
+; CGP-NEXT: v_mul_lo_u32 v16, v4, v13
+; CGP-NEXT: v_mul_hi_u32 v17, v4, v15
+; CGP-NEXT: v_mul_hi_u32 v15, v9, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v17, v9, v13
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v16, v4, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18
-; CGP-NEXT: v_mul_lo_u32 v16, v12, v17
-; CGP-NEXT: v_mul_lo_u32 v18, v5, v15
-; CGP-NEXT: v_mul_hi_u32 v19, v5, v17
-; CGP-NEXT: v_mul_hi_u32 v17, v12, v17
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19
; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v19, v12, v15
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16
-; CGP-NEXT: v_mul_hi_u32 v18, v5, v15
-; CGP-NEXT: v_mul_hi_u32 v15, v12, v15
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16
-; CGP-NEXT: v_addc_u32_e64 v16, s[4:5], v12, v15, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v14, v5
-; CGP-NEXT: v_mul_lo_u32 v17, v13, v16
-; CGP-NEXT: v_mul_lo_u32 v18, v13, v5
-; CGP-NEXT: v_mul_hi_u32 v13, v13, v5
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v13, vcc
+; CGP-NEXT: v_mul_lo_u32 v12, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v15, v11, v14
+; CGP-NEXT: v_mul_lo_u32 v16, v11, v4
+; CGP-NEXT: v_mul_hi_u32 v11, v11, v4
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
+; CGP-NEXT: v_mul_hi_u32 v13, v4, v16
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v14, v16
+; CGP-NEXT: v_mul_lo_u32 v15, v4, v11
+; CGP-NEXT: v_mul_hi_u32 v16, v14, v16
; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v18
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT: v_mul_lo_u32 v14, v16, v18
-; CGP-NEXT: v_mul_lo_u32 v17, v5, v13
-; CGP-NEXT: v_mul_hi_u32 v18, v16, v18
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT: v_mul_lo_u32 v15, v16, v13
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14
-; CGP-NEXT: v_mul_hi_u32 v17, v5, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v16, v13
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v13, v14, v11
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12
+; CGP-NEXT: v_mul_hi_u32 v15, v4, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v14, v11
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15
-; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v13, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v9, v5
-; CGP-NEXT: v_mul_lo_u32 v14, v10, v12
-; CGP-NEXT: v_mul_hi_u32 v15, v10, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v9, v5
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v9, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v9, v12
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v15, v5
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v8, v4
+; CGP-NEXT: v_mul_lo_u32 v12, v5, v9
+; CGP-NEXT: v_mul_hi_u32 v13, v5, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v8, v4
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_mul_hi_u32 v12, v5, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v0, v5
-; CGP-NEXT: v_mul_lo_u32 v12, v1, v12
-; CGP-NEXT: v_mul_lo_u32 v14, v1, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_sub_i32_e32 v10, vcc, v10, v14
-; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v9, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v9, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_mul_lo_u32 v11, v0, v4
+; CGP-NEXT: v_mul_lo_u32 v9, v1, v9
+; CGP-NEXT: v_mul_lo_u32 v12, v1, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v8, v4, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v8, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v0
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v0
+; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v0, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5]
+; CGP-NEXT: v_sub_i32_e32 v11, vcc, v5, v1
+; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v4, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v0, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[4:5]
-; CGP-NEXT: v_sub_i32_e32 v13, vcc, v10, v1
-; CGP-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v5, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v1
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v0
-; CGP-NEXT: v_subb_u32_e32 v0, vcc, v5, v0, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5]
+; CGP-NEXT: v_subb_u32_e32 v0, vcc, v4, v0, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v1
+; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v11
-; CGP-NEXT: v_xor_b32_e32 v5, v0, v11
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v11
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v5, v11, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v10
+; CGP-NEXT: v_xor_b32_e32 v4, v0, v10
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v10
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v10, vcc
+; CGP-NEXT: ; implicit-def: $vgpr8
+; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: BB2_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -863,144 +867,146 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: ; %bb.5:
; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v7
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v4
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v4, vcc
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v5, v5, v4
-; CGP-NEXT: v_xor_b32_e32 v4, v7, v4
-; CGP-NEXT: v_cvt_f32_u32_e32 v7, v5
-; CGP-NEXT: v_cvt_f32_u32_e32 v8, v4
-; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3
-; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8
-; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v2, v9
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v5
-; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v7
-; CGP-NEXT: v_trunc_f32_e32 v10, v10
-; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v10
+; CGP-NEXT: v_xor_b32_e32 v4, v6, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v6, v5
+; CGP-NEXT: v_cvt_f32_u32_e32 v7, v4
+; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
+; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v5
+; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v8
+; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
+; CGP-NEXT: v_trunc_f32_e32 v7, v7
+; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v8, v8, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v12, v7
-; CGP-NEXT: v_mul_lo_u32 v14, v11, v10
-; CGP-NEXT: v_mul_hi_u32 v16, v11, v7
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v7
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v9
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v10, v6
+; CGP-NEXT: v_mul_lo_u32 v12, v9, v7
+; CGP-NEXT: v_mul_hi_u32 v14, v9, v6
+; CGP-NEXT: v_mul_lo_u32 v13, v9, v6
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT: v_mul_lo_u32 v12, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v6, v11
+; CGP-NEXT: v_mul_hi_u32 v15, v6, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v15, v7, v11
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT: v_mul_hi_u32 v14, v6, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v11
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
-; CGP-NEXT: v_mul_lo_u32 v14, v10, v15
-; CGP-NEXT: v_mul_lo_u32 v16, v7, v13
-; CGP-NEXT: v_mul_hi_u32 v17, v7, v15
-; CGP-NEXT: v_mul_hi_u32 v15, v10, v15
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v17, v10, v13
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT: v_mul_hi_u32 v16, v7, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
-; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v10, v13, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v12, v7
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v14
-; CGP-NEXT: v_mul_lo_u32 v16, v11, v7
-; CGP-NEXT: v_mul_hi_u32 v11, v11, v7
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v7, v11, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v10, v6
+; CGP-NEXT: v_mul_lo_u32 v13, v9, v12
+; CGP-NEXT: v_mul_lo_u32 v14, v9, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v9, v6
+; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
+; CGP-NEXT: v_mul_hi_u32 v11, v6, v14
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9
+; CGP-NEXT: v_mul_lo_u32 v10, v12, v14
+; CGP-NEXT: v_mul_lo_u32 v13, v6, v9
+; CGP-NEXT: v_mul_hi_u32 v14, v12, v14
; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v16
-; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
-; CGP-NEXT: v_mul_lo_u32 v12, v14, v16
-; CGP-NEXT: v_mul_lo_u32 v15, v7, v11
-; CGP-NEXT: v_mul_hi_u32 v16, v14, v16
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT: v_mul_lo_u32 v13, v14, v11
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12
-; CGP-NEXT: v_mul_hi_u32 v15, v7, v11
-; CGP-NEXT: v_mul_hi_u32 v11, v14, v11
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v11, v12, v9
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10
+; CGP-NEXT: v_mul_hi_u32 v13, v6, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v12, v9
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v11, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v3, v6
+; CGP-NEXT: v_mul_lo_u32 v10, v2, v7
+; CGP-NEXT: v_mul_hi_u32 v11, v2, v6
+; CGP-NEXT: v_mul_hi_u32 v6, v3, v6
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_mul_lo_u32 v11, v3, v7
-; CGP-NEXT: v_mul_lo_u32 v12, v8, v10
-; CGP-NEXT: v_mul_hi_u32 v13, v8, v7
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v2, v7
; CGP-NEXT: v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v3, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_mul_hi_u32 v12, v8, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v3, v10
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT: v_mul_lo_u32 v11, v4, v7
-; CGP-NEXT: v_mul_lo_u32 v10, v5, v10
-; CGP-NEXT: v_mul_lo_u32 v12, v5, v7
-; CGP-NEXT: v_mul_hi_u32 v7, v5, v7
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v3, v7, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v7
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_mul_lo_u32 v9, v4, v6
+; CGP-NEXT: v_mul_lo_u32 v7, v5, v7
+; CGP-NEXT: v_mul_lo_u32 v10, v5, v6
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
+; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v4
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v5
+; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5]
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, v8, v5
-; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v4
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v5
+; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v4, v4, v9
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v9
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v9, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v8
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v8
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v8, vcc
+; CGP-NEXT: ; implicit-def: $vgpr2
+; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: BB2_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -2480,144 +2486,146 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v5
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v2
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v2, vcc
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v3, v3, v2
-; CHECK-NEXT: v_xor_b32_e32 v2, v5, v2
-; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v3
-; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2
-; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v7
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
-; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CHECK-NEXT: v_mul_f32_e32 v8, 0x2f800000, v5
-; CHECK-NEXT: v_trunc_f32_e32 v8, v8
-; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v8
+; CHECK-NEXT: v_xor_b32_e32 v2, v4, v2
+; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v3
+; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2
+; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
+; CHECK-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; CHECK-NEXT: v_trunc_f32_e32 v5, v5
+; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v8
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc
-; CHECK-NEXT: v_xor_b32_e32 v6, v6, v7
-; CHECK-NEXT: v_mul_lo_u32 v11, v10, v5
-; CHECK-NEXT: v_mul_lo_u32 v12, v9, v8
-; CHECK-NEXT: v_mul_hi_u32 v14, v9, v5
-; CHECK-NEXT: v_mul_lo_u32 v13, v9, v5
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT: v_mul_lo_u32 v9, v8, v4
+; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5
+; CHECK-NEXT: v_mul_hi_u32 v12, v7, v4
+; CHECK-NEXT: v_mul_lo_u32 v11, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11
+; CHECK-NEXT: v_mul_lo_u32 v12, v4, v9
+; CHECK-NEXT: v_mul_hi_u32 v13, v4, v11
+; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; CHECK-NEXT: v_mul_hi_u32 v12, v4, v9
+; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v14
-; CHECK-NEXT: v_mul_lo_u32 v12, v8, v13
-; CHECK-NEXT: v_mul_lo_u32 v14, v5, v11
-; CHECK-NEXT: v_mul_hi_u32 v15, v5, v13
-; CHECK-NEXT: v_mul_hi_u32 v13, v8, v13
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v12, v15
; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v15, v8, v11
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CHECK-NEXT: v_mul_hi_u32 v14, v5, v11
-; CHECK-NEXT: v_mul_hi_u32 v11, v8, v11
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CHECK-NEXT: v_addc_u32_e64 v12, s[4:5], v8, v11, vcc
-; CHECK-NEXT: v_mul_lo_u32 v10, v10, v5
-; CHECK-NEXT: v_mul_lo_u32 v13, v9, v12
-; CHECK-NEXT: v_mul_lo_u32 v14, v9, v5
-; CHECK-NEXT: v_mul_hi_u32 v9, v9, v5
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
+; CHECK-NEXT: v_mul_lo_u32 v8, v8, v4
+; CHECK-NEXT: v_mul_lo_u32 v11, v7, v10
+; CHECK-NEXT: v_mul_lo_u32 v12, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v7, v4
+; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11
+; CHECK-NEXT: v_mul_hi_u32 v9, v4, v12
+; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
+; CHECK-NEXT: v_mul_lo_u32 v8, v10, v12
+; CHECK-NEXT: v_mul_lo_u32 v11, v4, v7
+; CHECK-NEXT: v_mul_hi_u32 v12, v10, v12
; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v14
-; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9
-; CHECK-NEXT: v_mul_lo_u32 v10, v12, v14
-; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9
-; CHECK-NEXT: v_mul_hi_u32 v14, v12, v14
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT: v_mul_lo_u32 v11, v12, v9
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10
-; CHECK-NEXT: v_mul_hi_u32 v13, v5, v9
-; CHECK-NEXT: v_mul_hi_u32 v9, v12, v9
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
-; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; CHECK-NEXT: v_mul_lo_u32 v9, v10, v7
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8
+; CHECK-NEXT: v_mul_hi_u32 v11, v4, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11
-; CHECK-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CHECK-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
+; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9
+; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9
+; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4
+; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5
+; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4
+; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5
-; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8
-; CHECK-NEXT: v_mul_hi_u32 v11, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v0, v5
; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v11, v1, v8
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CHECK-NEXT: v_mul_hi_u32 v10, v6, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT: v_mul_lo_u32 v9, v2, v5
-; CHECK-NEXT: v_mul_lo_u32 v8, v3, v8
-; CHECK-NEXT: v_mul_lo_u32 v10, v3, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v6, v10
-; CHECK-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v5, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT: v_mul_lo_u32 v7, v2, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v3, v5
+; CHECK-NEXT: v_mul_lo_u32 v8, v3, v4
+; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
+; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v3
+; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v2
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, v6, v3
-; CHECK-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v2
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v2
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v2, v2, v7
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v1, v7, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v6
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v1, v6, vcc
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: BB7_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -2965,139 +2973,141 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v0, v4, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v0
-; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v7
+; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_xor_b32_e32 v5, v5, v10
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v5, v11
-; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v1
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v10, vcc
+; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT: v_mul_f32_e32 v12, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v12, v12
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12
+; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4
+; CGP-NEXT: v_trunc_f32_e32 v7, v7
+; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v12
-; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v0, vcc
-; CGP-NEXT: v_xor_b32_e32 v6, v6, v11
-; CGP-NEXT: v_mul_lo_u32 v15, v14, v4
-; CGP-NEXT: v_mul_lo_u32 v16, v13, v12
-; CGP-NEXT: v_mul_hi_u32 v18, v13, v4
-; CGP-NEXT: v_mul_lo_u32 v17, v13, v4
-; CGP-NEXT: v_xor_b32_e32 v7, v7, v11
+; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT: v_xor_b32_e32 v6, v6, v10
+; CGP-NEXT: v_mul_lo_u32 v13, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v14, v11, v7
+; CGP-NEXT: v_mul_hi_u32 v16, v11, v4
+; CGP-NEXT: v_mul_lo_u32 v15, v11, v4
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_mul_lo_u32 v14, v7, v15
+; CGP-NEXT: v_mul_lo_u32 v16, v4, v13
+; CGP-NEXT: v_mul_hi_u32 v17, v4, v15
+; CGP-NEXT: v_mul_hi_u32 v15, v7, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v17, v7, v13
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v16, v4, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18
-; CGP-NEXT: v_mul_lo_u32 v16, v12, v17
-; CGP-NEXT: v_mul_lo_u32 v18, v4, v15
-; CGP-NEXT: v_mul_hi_u32 v19, v4, v17
-; CGP-NEXT: v_mul_hi_u32 v17, v12, v17
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v19
; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v19, v12, v15
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16
-; CGP-NEXT: v_mul_hi_u32 v18, v4, v15
-; CGP-NEXT: v_mul_hi_u32 v15, v12, v15
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v19, v17
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16
-; CGP-NEXT: v_addc_u32_e64 v16, s[4:5], v12, v15, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v14, v4
-; CGP-NEXT: v_mul_lo_u32 v17, v13, v16
-; CGP-NEXT: v_mul_lo_u32 v18, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v13, v4
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v7, v13, vcc
+; CGP-NEXT: v_mul_lo_u32 v12, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v15, v11, v14
+; CGP-NEXT: v_mul_lo_u32 v16, v11, v4
+; CGP-NEXT: v_mul_hi_u32 v11, v11, v4
+; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
+; CGP-NEXT: v_mul_hi_u32 v13, v4, v16
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v14, v16
+; CGP-NEXT: v_mul_lo_u32 v15, v4, v11
+; CGP-NEXT: v_mul_hi_u32 v16, v14, v16
; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT: v_mul_hi_u32 v15, v4, v18
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT: v_mul_lo_u32 v14, v16, v18
-; CGP-NEXT: v_mul_lo_u32 v17, v4, v13
-; CGP-NEXT: v_mul_hi_u32 v18, v16, v18
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT: v_mul_lo_u32 v15, v16, v13
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14
-; CGP-NEXT: v_mul_hi_u32 v17, v4, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v16, v13
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v15, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v17, s[4:5], v18, v17
-; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v15, v14
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v17, v15
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v13, v14, v11
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12
+; CGP-NEXT: v_mul_hi_u32 v15, v4, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v14, v11
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15
-; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v13, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v7, v4
-; CGP-NEXT: v_mul_lo_u32 v14, v6, v12
-; CGP-NEXT: v_mul_hi_u32 v15, v6, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v7, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v7, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v6, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v7, v12
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15
+; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v6, v4
+; CGP-NEXT: v_mul_lo_u32 v12, v5, v7
+; CGP-NEXT: v_mul_hi_u32 v13, v5, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v6, v4
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v6, v7
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_mul_hi_u32 v12, v5, v7
+; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v0, v4
-; CGP-NEXT: v_mul_lo_u32 v12, v1, v12
-; CGP-NEXT: v_mul_lo_u32 v14, v1, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, v6, v14
-; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v7, v4, vcc
-; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_mul_lo_u32 v11, v0, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v1, v7
+; CGP-NEXT: v_mul_lo_u32 v12, v1, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v6, v4, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v6, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0
+; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v0, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5]
+; CGP-NEXT: v_sub_i32_e32 v11, vcc, v5, v1
+; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v4, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0
-; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v0, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5]
-; CGP-NEXT: v_sub_i32_e32 v13, vcc, v6, v1
-; CGP-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v4, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v1
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v0
; CGP-NEXT: v_subb_u32_e32 v0, vcc, v4, v0, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[4:5]
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v1
+; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v0, v14, v0, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v11
-; CGP-NEXT: v_xor_b32_e32 v4, v0, v11
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v11
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v11, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v10
+; CGP-NEXT: v_xor_b32_e32 v4, v0, v10
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v10
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v10, vcc
+; CGP-NEXT: ; implicit-def: $vgpr5
+; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11
; CGP-NEXT: BB8_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -3138,139 +3148,141 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v4, v6, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v5
; CGP-NEXT: v_cvt_f32_u32_e32 v7, v4
-; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3
+; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc
; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v2, v9
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v5
+; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v5
+; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v8
; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v6
-; CGP-NEXT: v_trunc_f32_e32 v10, v10
-; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10
+; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
+; CGP-NEXT: v_trunc_f32_e32 v7, v7
+; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v7, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v12, v6
-; CGP-NEXT: v_mul_lo_u32 v14, v11, v10
-; CGP-NEXT: v_mul_hi_u32 v16, v11, v6
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v6
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v9
+; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v10, v6
+; CGP-NEXT: v_mul_lo_u32 v12, v9, v7
+; CGP-NEXT: v_mul_hi_u32 v14, v9, v6
+; CGP-NEXT: v_mul_lo_u32 v13, v9, v6
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT: v_mul_lo_u32 v12, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v6, v11
+; CGP-NEXT: v_mul_hi_u32 v15, v6, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v15, v7, v11
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT: v_mul_hi_u32 v14, v6, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v11
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
-; CGP-NEXT: v_mul_lo_u32 v14, v10, v15
-; CGP-NEXT: v_mul_lo_u32 v16, v6, v13
-; CGP-NEXT: v_mul_hi_u32 v17, v6, v15
-; CGP-NEXT: v_mul_hi_u32 v15, v10, v15
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v17, v10, v13
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT: v_mul_hi_u32 v16, v6, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14
-; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v10, v13, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v12, v6
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v14
-; CGP-NEXT: v_mul_lo_u32 v16, v11, v6
-; CGP-NEXT: v_mul_hi_u32 v11, v11, v6
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v7, v11, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v10, v6
+; CGP-NEXT: v_mul_lo_u32 v13, v9, v12
+; CGP-NEXT: v_mul_lo_u32 v14, v9, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v9, v6
+; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
+; CGP-NEXT: v_mul_hi_u32 v11, v6, v14
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9
+; CGP-NEXT: v_mul_lo_u32 v10, v12, v14
+; CGP-NEXT: v_mul_lo_u32 v13, v6, v9
+; CGP-NEXT: v_mul_hi_u32 v14, v12, v14
; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT: v_mul_hi_u32 v13, v6, v16
-; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
-; CGP-NEXT: v_mul_lo_u32 v12, v14, v16
-; CGP-NEXT: v_mul_lo_u32 v15, v6, v11
-; CGP-NEXT: v_mul_hi_u32 v16, v14, v16
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT: v_mul_lo_u32 v13, v14, v11
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12
-; CGP-NEXT: v_mul_hi_u32 v15, v6, v11
-; CGP-NEXT: v_mul_hi_u32 v11, v14, v11
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v15, s[4:5], v16, v15
-; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CGP-NEXT: v_mul_lo_u32 v11, v12, v9
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10
+; CGP-NEXT: v_mul_hi_u32 v13, v6, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v12, v9
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v11, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v3, v6
-; CGP-NEXT: v_mul_lo_u32 v12, v7, v10
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v6
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11
+; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v3, v6
+; CGP-NEXT: v_mul_lo_u32 v10, v2, v7
+; CGP-NEXT: v_mul_hi_u32 v11, v2, v6
; CGP-NEXT: v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v3, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_mul_hi_u32 v12, v7, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v3, v10
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v3, v7
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v2, v7
+; CGP-NEXT: v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT: v_mul_lo_u32 v11, v4, v6
-; CGP-NEXT: v_mul_lo_u32 v10, v5, v10
-; CGP-NEXT: v_mul_lo_u32 v12, v5, v6
-; CGP-NEXT: v_mul_hi_u32 v6, v5, v6
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v3, v6, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_mul_lo_u32 v9, v4, v6
+; CGP-NEXT: v_mul_lo_u32 v7, v5, v7
+; CGP-NEXT: v_mul_lo_u32 v10, v5, v6
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v4
; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5]
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, v7, v5
-; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5
+; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v5
+; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v4
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v5
+; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v4, v4, v9
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v9
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v9, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v8
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v8
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v8, vcc
+; CGP-NEXT: ; implicit-def: $vgpr2
+; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9
; CGP-NEXT: BB8_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 1aadea0b9b835..4ee838f942b39 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -119,30 +119,32 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v10
; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v7
-; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v6, vcc
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v6, vcc
; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v6, v13, v6, vcc
-; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v2
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v8, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: BB0_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7]
@@ -739,30 +741,32 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v14
; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, v8, v11
-; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v9, v10, vcc
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v9, v10, vcc
; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v10
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v5
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v5, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v13, v5
-; CGP-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, v11, v4
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5
+; CGP-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4
; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v4
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5
-; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v11, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v12, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v9, v15, v16, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v12, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v16, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: ; implicit-def: $vgpr8
+; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: BB2_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@@ -901,30 +905,32 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v12
; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v9
-; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v3, v8, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v3, v8, vcc
; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v8
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v7
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v11, v7
-; CGP-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, v9, v6
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7
+; CGP-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v6
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
-; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v10, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; CGP-NEXT: ; implicit-def: $vgpr2
+; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: BB2_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@@ -2399,30 +2405,32 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v10
; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v7
-; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v6, vcc
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v6, vcc
; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v6, v13, v6, vcc
-; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v7, v4
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v8, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: BB7_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7]
@@ -2842,30 +2850,32 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v14
; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, v5, v6
-; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v7, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v7, v4, vcc
; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v10
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v10
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v11
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v11
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v11, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v13, v11
-; CGP-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v11
+; CGP-NEXT: v_cndmask_b32_e32 v6, v13, v7, vcc
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10
; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v10
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v10
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v11
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v11
-; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v12, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v6, v15, v16, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v16, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: ; implicit-def: $vgpr5
+; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11
; CGP-NEXT: BB8_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
@@ -3004,30 +3014,32 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v12
; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v7
-; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v3, v6, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
+; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v8
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v9
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v11, v9
-; CGP-NEXT: v_cndmask_b32_e32 v6, v15, v6, vcc
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v8
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v8
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8
+; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9
-; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v10, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; CGP-NEXT: ; implicit-def: $vgpr2
+; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9
; CGP-NEXT: BB8_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[6:7]
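The `; implicit-def: $vgprN` lines added throughout these checks are the assembly printer's comments for IMPLICIT_DEF machine instructions. An IMPLICIT_DEF defines a register without reading anything, so whatever the register held before it is dead from that point on. A MIR-flavored sketch of the effect (register numbers illustrative, not taken from the commit):

  bb.1.if.then:
    ...                     ; last real uses of the input VGPRs on this path
    $vgpr0 = IMPLICIT_DEF   ; prints as "; implicit-def: $vgpr0" in the final assembly
  bb.2.Flow:
    ...                     ; the old value in $vgpr0 is dead here, so the other
                            ; arm of the if-else is free to reuse the register

Ending the inputs' live ranges at the bottom of the then-block is why the updated checks above get by with lower-numbered VGPRs than before (v0/v2 where the old output needed v9/v11, for example).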
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 954022fb71c04..580dc2f4b81d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -115,33 +115,35 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_mul_lo_u32 v5, v2, v5
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v7
-; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v4, vcc
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
-; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v5, v2
-; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v2
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v8, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v11, vcc, v7, v2
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v6, v2
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v8, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: BB0_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -728,32 +730,34 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v11
-; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v9, v0, vcc
+; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v9, v0, vcc
; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v9, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v5
-; CGP-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, v1, v4
-; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v5
+; CGP-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; CGP-NEXT: v_sub_i32_e32 v10, vcc, v1, v4
+; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v0, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v5
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc
-; CGP-NEXT: v_sub_i32_e32 v15, vcc, v11, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v12, v5
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v13, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5
+; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v12, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v5, v11, v15, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v11, v12, v0, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v11, v0, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
+; CGP-NEXT: ; implicit-def: $vgpr8
+; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: BB2_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -886,33 +890,35 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_mul_lo_u32 v5, v6, v5
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v2, v9
-; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
-; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, v5, v6
-; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v6
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v6
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v7
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
-; CGP-NEXT: v_sub_i32_e32 v13, vcc, v9, v6
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, v8, v6
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v7
-; CGP-NEXT: v_cndmask_b32_e32 v7, v12, v11, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7
+; CGP-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e32 v7, v9, v13, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v8, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; CGP-NEXT: ; implicit-def: $vgpr2
+; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: BB2_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -1755,33 +1761,35 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v3
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v7
-; CHECK-NEXT: v_subb_u32_e64 v6, s[4:5], v1, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v1, v2, vcc
; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v3, v4
-; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v4
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v8, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; CHECK-NEXT: v_sub_i32_e32 v11, vcc, v7, v4
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v6, v4
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v8, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5
+; CHECK-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
; CHECK-NEXT: BB7_2: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -2197,29 +2205,31 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v0, vcc
; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v7, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v10
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v11
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v11, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v11
-; CGP-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v1, v10
-; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v10
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, v1, v10
+; CGP-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v10
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v11, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v11
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc
-; CGP-NEXT: v_sub_i32_e32 v15, vcc, v7, v10
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v11
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_sub_i32_e32 v10, vcc, v6, v10
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v12, v11
-; CGP-NEXT: v_cndmask_b32_e32 v11, v14, v13, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v11
+; CGP-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v11, v12, v0, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v7, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
+; CGP-NEXT: ; implicit-def: $vgpr5
+; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11
; CGP-NEXT: BB8_2: ; %Flow2
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -2352,33 +2362,35 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_mul_lo_u32 v5, v8, v5
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v2, v7
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v8
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v9
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v9
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
-; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v8
-; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v8
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v9
+; CGP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, v2, v8
+; CGP-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v8
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v9
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
-; CGP-NEXT: v_sub_i32_e32 v13, vcc, v7, v8
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v6, v8
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9
-; CGP-NEXT: v_cndmask_b32_e32 v9, v12, v11, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e32 v9, v11, v10, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v6, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
+; CGP-NEXT: ; implicit-def: $vgpr2
+; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9
; CGP-NEXT: BB8_6: ; %Flow
; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; CGP-NEXT: s_xor_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index d10d911aeac07..ffba5af091758 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -8,135 +8,136 @@ define i64 @sdiv64(i64 %a, i64 %b) {
; GFX9-LABEL: sdiv64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: v_or_b32_e32 v4, v1, v5
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4]
-; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: v_or_b32_e32 v5, v1, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz BB0_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v3
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v4, v4, v3
-; GFX9-NEXT: v_xor_b32_e32 v5, v5, v3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v4
-; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mov_b32_e32 v16, 0
-; GFX9-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
-; GFX9-NEXT: v_rcp_f32_e32 v6, v6
+; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4
+; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v3
+; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v15, 0
-; GFX9-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GFX9-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
-; GFX9-NEXT: v_trunc_f32_e32 v7, v7
-; GFX9-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7
+; GFX9-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; GFX9-NEXT: v_rcp_f32_e32 v5, v5
+; GFX9-NEXT: v_mov_b32_e32 v14, 0
+; GFX9-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
+; GFX9-NEXT: v_trunc_f32_e32 v6, v6
+; GFX9-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT: v_mul_lo_u32 v10, v9, v6
-; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6
-; GFX9-NEXT: v_mul_lo_u32 v12, v8, v7
-; GFX9-NEXT: v_mul_lo_u32 v13, v8, v6
-; GFX9-NEXT: v_add3_u32 v10, v11, v12, v10
-; GFX9-NEXT: v_mul_lo_u32 v12, v6, v10
-; GFX9-NEXT: v_mul_hi_u32 v14, v6, v13
-; GFX9-NEXT: v_mul_hi_u32 v11, v6, v10
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v12
-; GFX9-NEXT: v_mul_lo_u32 v14, v7, v13
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v11, vcc
-; GFX9-NEXT: v_mul_hi_u32 v13, v7, v13
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v14
-; GFX9-NEXT: v_mul_hi_u32 v12, v7, v10
-; GFX9-NEXT: v_mul_lo_u32 v10, v7, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v13, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT: v_add_co_u32_e64 v6, s[4:5], v6, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v12, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v10, vcc, v7, v11, s[4:5]
-; GFX9-NEXT: v_mul_lo_u32 v12, v8, v10
-; GFX9-NEXT: v_mul_hi_u32 v13, v8, v6
-; GFX9-NEXT: v_mul_lo_u32 v9, v9, v6
-; GFX9-NEXT: v_mul_lo_u32 v8, v8, v6
-; GFX9-NEXT: v_add_u32_e32 v7, v7, v11
-; GFX9-NEXT: v_add3_u32 v9, v13, v12, v9
-; GFX9-NEXT: v_mul_lo_u32 v12, v6, v9
-; GFX9-NEXT: v_mul_hi_u32 v13, v6, v8
-; GFX9-NEXT: v_mul_hi_u32 v14, v6, v9
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12
-; GFX9-NEXT: v_mul_hi_u32 v13, v10, v8
-; GFX9-NEXT: v_mul_lo_u32 v8, v10, v8
-; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v16, v14, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v12, v8
-; GFX9-NEXT: v_mul_hi_u32 v8, v10, v9
-; GFX9-NEXT: v_mul_lo_u32 v9, v10, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v13, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v15, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v12, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v16, v8, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v7, vcc, v7, v8, s[4:5]
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v1
-; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v8
-; GFX9-NEXT: v_xor_b32_e32 v9, v9, v8
-; GFX9-NEXT: v_mul_lo_u32 v10, v9, v7
-; GFX9-NEXT: v_mul_hi_u32 v11, v9, v6
+; GFX9-NEXT: v_mul_lo_u32 v9, v8, v5
+; GFX9-NEXT: v_mul_hi_u32 v10, v7, v5
+; GFX9-NEXT: v_mul_lo_u32 v11, v7, v6
+; GFX9-NEXT: v_mul_lo_u32 v12, v7, v5
+; GFX9-NEXT: v_add3_u32 v9, v10, v11, v9
+; GFX9-NEXT: v_mul_lo_u32 v11, v5, v9
+; GFX9-NEXT: v_mul_hi_u32 v13, v5, v12
+; GFX9-NEXT: v_mul_hi_u32 v10, v5, v9
+; GFX9-NEXT: v_mul_hi_u32 v16, v6, v9
+; GFX9-NEXT: v_mul_lo_u32 v9, v6, v9
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v13, v11
+; GFX9-NEXT: v_mul_lo_u32 v13, v6, v12
+; GFX9-NEXT: v_mul_hi_u32 v12, v6, v12
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v10, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9
+; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v9
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5]
+; GFX9-NEXT: v_mul_lo_u32 v11, v7, v9
+; GFX9-NEXT: v_mul_hi_u32 v12, v7, v5
+; GFX9-NEXT: v_mul_lo_u32 v8, v8, v5
+; GFX9-NEXT: v_mul_lo_u32 v7, v7, v5
+; GFX9-NEXT: v_add_u32_e32 v6, v6, v10
+; GFX9-NEXT: v_add3_u32 v8, v12, v11, v8
+; GFX9-NEXT: v_mul_lo_u32 v13, v5, v8
+; GFX9-NEXT: v_mul_hi_u32 v16, v5, v7
+; GFX9-NEXT: v_mul_hi_u32 v17, v5, v8
; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v8, vcc
-; GFX9-NEXT: v_xor_b32_e32 v1, v1, v8
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v12, vcc
-; GFX9-NEXT: v_mul_lo_u32 v12, v1, v6
-; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6
-; GFX9-NEXT: v_mul_hi_u32 v13, v1, v7
-; GFX9-NEXT: v_mul_lo_u32 v7, v1, v7
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v6, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v15, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v10, vcc
-; GFX9-NEXT: v_mul_lo_u32 v10, v5, v6
-; GFX9-NEXT: v_mul_lo_u32 v11, v4, v7
-; GFX9-NEXT: v_mul_hi_u32 v12, v4, v6
-; GFX9-NEXT: v_mul_lo_u32 v13, v4, v6
-; GFX9-NEXT: v_add3_u32 v10, v12, v11, v10
-; GFX9-NEXT: v_sub_u32_e32 v11, v1, v10
-; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v13
-; GFX9-NEXT: v_subb_co_u32_e64 v11, s[4:5], v11, v5, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v12, s[4:5], v9, v4
-; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[4:5], 0, v11, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v5
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v13, v12, s[4:5]
-; GFX9-NEXT: v_add_co_u32_e64 v12, s[4:5], 2, v6
-; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, v7, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; GFX9-NEXT: v_add_co_u32_e64 v14, s[4:5], 1, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, v7, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v14, v12, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v15, v13, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9-NEXT: v_xor_b32_e32 v5, v8, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v11, vcc
-; GFX9-NEXT: v_xor_b32_e32 v3, v4, v5
-; GFX9-NEXT: v_xor_b32_e32 v1, v1, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7
+; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v16, v13
+; GFX9-NEXT: v_mul_hi_u32 v11, v9, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v17, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v9, v8
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v13, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v12, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v11, v14, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v15, v9, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v7
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7
+; GFX9-NEXT: v_mul_lo_u32 v8, v0, v6
+; GFX9-NEXT: v_mul_hi_u32 v9, v0, v5
+; GFX9-NEXT: v_mul_hi_u32 v10, v0, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v15, v10, vcc
+; GFX9-NEXT: v_mul_lo_u32 v10, v1, v5
+; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5
+; GFX9-NEXT: v_mul_hi_u32 v11, v1, v6
+; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v14, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v8, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v3, v5
+; GFX9-NEXT: v_mul_lo_u32 v9, v2, v6
+; GFX9-NEXT: v_mul_hi_u32 v10, v2, v5
+; GFX9-NEXT: v_mul_lo_u32 v11, v2, v5
+; GFX9-NEXT: v_add3_u32 v8, v10, v9, v8
+; GFX9-NEXT: v_sub_u32_e32 v9, v1, v8
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v11
+; GFX9-NEXT: v_subb_co_u32_e64 v9, s[4:5], v9, v3, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v0, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[4:5], 0, v9, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v11, v10, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 2, v5
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX9-NEXT: v_add_co_u32_e64 v12, s[4:5], 1, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v12, v10, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v13, v11, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_xor_b32_e32 v2, v7, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v9, vcc
+; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v1, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v0, v2, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: BB0_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -144,6 +145,7 @@ define i64 @sdiv64(i64 %a, i64 %b) {
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2
; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -156,16 +158,15 @@ define i64 @sdiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_add_u32_e32 v3, 1, v1
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v3, vcc
; GFX9-NEXT: BB0_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%d = sdiv i64 %a, %b
ret i64 %d
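For context, the sdiv64 checks above all come from this one-line test: the slow-division bypass has already expanded the 64-bit sdiv into a divergent if-else (a full 64-bit expansion when the operands need more than 32 bits, a 32-bit division otherwise) before the new pass runs. A rough IR sketch of that expanded shape, with illustrative names not taken from the commit:

define i64 @sdiv64_sketch(i64 %a, i64 %b) {
entry:
  %joined = or i64 %a, %b
  %hi = lshr i64 %joined, 32     ; any bits above 32 in either operand?
  %fits32 = icmp eq i64 %hi, 0
  br i1 %fits32, label %fast, label %slow

slow:                            ; the long sequence in %bb.1 above
  %d64 = sdiv i64 %a, %b
  br label %merge

fast:                            ; the short sequence in %bb.3 above
  %a32 = trunc i64 %a to i32
  %b32 = trunc i64 %b to i32
  %d32 = udiv i32 %a32, %b32     ; high words are zero, so both operands are
                                 ; non-negative and signed == unsigned division
  %ext = zext i32 %d32 to i64
  br label %merge

merge:
  %d = phi i64 [ %d64, %slow ], [ %ext, %fast ]
  ret i64 %d
}

Because %a and %b are not needed past their last uses in the slow arm, their live ranges can end there, which is what the new `; implicit-def: $vgpr0_vgpr1` and `; implicit-def: $vgpr2_vgpr3` lines at the end of %bb.1 express.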
@@ -261,33 +262,35 @@ define i64 @udiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mul_lo_u32 v9, v2, v4
; GFX9-NEXT: v_add3_u32 v6, v8, v7, v6
; GFX9-NEXT: v_sub_u32_e32 v7, v1, v6
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v9
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v9
; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v7, v3, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v8, v2
+; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v0, v2
; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5]
; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v9, s[4:5]
-; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 2, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, v5, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], 2, v4
+; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5]
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_add_co_u32_e64 v11, s[4:5], 1, v4
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[4:5], 1, v4
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2
-; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, v5, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v12, v10, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v11, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v10, v8, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: BB1_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -326,133 +329,134 @@ define i64 @srem64(i64 %a, i64 %b) {
; GFX9-LABEL: srem64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: v_or_b32_e32 v4, v1, v5
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4]
-; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: v_or_b32_e32 v5, v1, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz BB2_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v3
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v5, v5, v3
-; GFX9-NEXT: v_xor_b32_e32 v3, v4, v3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mov_b32_e32 v15, 0
-; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
-; GFX9-NEXT: v_rcp_f32_e32 v4, v4
+; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4
+; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v14, 0
+; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; GFX9-NEXT: v_rcp_f32_e32 v4, v4
+; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4
-; GFX9-NEXT: v_trunc_f32_e32 v6, v6
-; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
+; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX9-NEXT: v_trunc_f32_e32 v5, v5
+; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT: v_mul_lo_u32 v9, v8, v4
-; GFX9-NEXT: v_mul_hi_u32 v10, v7, v4
-; GFX9-NEXT: v_mul_lo_u32 v11, v7, v6
-; GFX9-NEXT: v_mul_lo_u32 v12, v7, v4
-; GFX9-NEXT: v_add3_u32 v9, v10, v11, v9
-; GFX9-NEXT: v_mul_lo_u32 v11, v4, v9
-; GFX9-NEXT: v_mul_hi_u32 v13, v4, v12
-; GFX9-NEXT: v_mul_hi_u32 v10, v4, v9
-; GFX9-NEXT: v_mul_hi_u32 v16, v6, v9
-; GFX9-NEXT: v_mul_lo_u32 v9, v6, v9
-; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v13, v11
-; GFX9-NEXT: v_mul_lo_u32 v13, v6, v12
-; GFX9-NEXT: v_mul_hi_u32 v12, v6, v12
-; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v10, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13
-; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5]
-; GFX9-NEXT: v_mul_lo_u32 v11, v7, v9
-; GFX9-NEXT: v_mul_hi_u32 v12, v7, v4
-; GFX9-NEXT: v_mul_lo_u32 v8, v8, v4
+; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT: v_mul_lo_u32 v8, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v9, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v10, v6, v5
+; GFX9-NEXT: v_mul_lo_u32 v11, v6, v4
+; GFX9-NEXT: v_add3_u32 v8, v9, v10, v8
+; GFX9-NEXT: v_mul_lo_u32 v10, v4, v8
+; GFX9-NEXT: v_mul_hi_u32 v12, v4, v11
+; GFX9-NEXT: v_mul_hi_u32 v9, v4, v8
+; GFX9-NEXT: v_mul_hi_u32 v15, v5, v8
+; GFX9-NEXT: v_mul_lo_u32 v8, v5, v8
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v12, v10
+; GFX9-NEXT: v_mul_lo_u32 v12, v5, v11
+; GFX9-NEXT: v_mul_hi_u32 v11, v5, v11
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v14, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v13, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v14, v10, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5]
+; GFX9-NEXT: v_mul_lo_u32 v10, v6, v8
+; GFX9-NEXT: v_mul_hi_u32 v11, v6, v4
; GFX9-NEXT: v_mul_lo_u32 v7, v7, v4
-; GFX9-NEXT: v_add_u32_e32 v6, v6, v10
-; GFX9-NEXT: v_add3_u32 v8, v12, v11, v8
-; GFX9-NEXT: v_mul_lo_u32 v11, v4, v8
-; GFX9-NEXT: v_mul_hi_u32 v12, v4, v7
-; GFX9-NEXT: v_mul_hi_u32 v16, v4, v8
-; GFX9-NEXT: v_mul_hi_u32 v13, v9, v8
-; GFX9-NEXT: v_mul_lo_u32 v8, v9, v8
-; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11
-; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7
-; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v16, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v11, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v12, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v13, v14, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v15, v9, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5]
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v7
-; GFX9-NEXT: v_xor_b32_e32 v8, v8, v7
-; GFX9-NEXT: v_mul_lo_u32 v9, v8, v6
-; GFX9-NEXT: v_mul_hi_u32 v10, v8, v4
+; GFX9-NEXT: v_mul_lo_u32 v6, v6, v4
+; GFX9-NEXT: v_add_u32_e32 v5, v5, v9
+; GFX9-NEXT: v_add3_u32 v7, v11, v10, v7
+; GFX9-NEXT: v_mul_lo_u32 v12, v4, v7
+; GFX9-NEXT: v_mul_hi_u32 v15, v4, v6
+; GFX9-NEXT: v_mul_hi_u32 v16, v4, v7
; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7
-; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
-; GFX9-NEXT: v_mul_lo_u32 v11, v1, v4
-; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT: v_mul_hi_u32 v12, v1, v6
-; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6
-; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v11
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v12, v14, vcc
+; GFX9-NEXT: v_mul_lo_u32 v6, v8, v6
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v15, v12
+; GFX9-NEXT: v_mul_hi_u32 v10, v8, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v14, v16, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v11, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v13, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v8, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5]
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v9, vcc
-; GFX9-NEXT: v_mul_lo_u32 v9, v5, v4
-; GFX9-NEXT: v_mul_hi_u32 v10, v3, v4
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v6
-; GFX9-NEXT: v_mul_lo_u32 v4, v3, v4
-; GFX9-NEXT: v_add3_u32 v6, v10, v6, v9
-; GFX9-NEXT: v_sub_u32_e32 v9, v1, v6
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v8, v4
-; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v9, v5, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v4, v3
-; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v5
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v5, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7]
-; GFX9-NEXT: v_sub_co_u32_e64 v12, s[4:5], v9, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3
-; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v12, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
-; GFX9-NEXT: v_xor_b32_e32 v3, v3, v7
-; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7
-; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v7, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v6
+; GFX9-NEXT: v_mul_lo_u32 v7, v0, v5
+; GFX9-NEXT: v_mul_hi_u32 v8, v0, v4
+; GFX9-NEXT: v_mul_hi_u32 v9, v0, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-NEXT: v_xor_b32_e32 v1, v1, v6
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v14, v9, vcc
+; GFX9-NEXT: v_mul_lo_u32 v9, v1, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4
+; GFX9-NEXT: v_mul_hi_u32 v10, v1, v5
+; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v13, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v14, v7, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, v3, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v2, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v2, v5
+; GFX9-NEXT: v_mul_lo_u32 v4, v2, v4
+; GFX9-NEXT: v_add3_u32 v5, v8, v5, v7
+; GFX9-NEXT: v_sub_u32_e32 v7, v1, v5
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v7, v3, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v7, s[4:5], v0, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[6:7], 0, v4, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7]
+; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v7, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v10, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v6
+; GFX9-NEXT: v_xor_b32_e32 v1, v1, v6
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v6, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: BB2_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -460,7 +464,7 @@ define i64 @srem64(i64 %a, i64 %b) {
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2
; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -475,11 +479,11 @@ define i64 @srem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: v_sub_u32_e32 v1, v0, v2
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
; GFX9-NEXT: BB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%d = srem i64 %a, %b
ret i64 %d
@@ -575,32 +579,34 @@ define i64 @urem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mul_lo_u32 v4, v2, v4
; GFX9-NEXT: v_add3_u32 v5, v7, v5, v6
; GFX9-NEXT: v_sub_u32_e32 v6, v1, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v4
-; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v6, v3, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v7, s[4:5], v4, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[6:7], 0, v6, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3
-; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v6, v3, s[4:5]
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v6, v3, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v6, s[4:5], v0, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[6:7], 0, v4, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7]
-; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v7, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v7, v3
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9
+; GFX9-NEXT: v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7]
+; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v6, v2
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v6, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v10, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: BB3_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -762,147 +768,148 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
; GFX9-LABEL: sdivrem64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: v_or_b32_e32 v4, v1, v7
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4]
-; GFX9-NEXT: ; implicit-def: $vgpr5_vgpr6
-; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT: v_or_b32_e32 v5, v1, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz BB8_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v7
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v3
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v6, v4, v3
-; GFX9-NEXT: v_xor_b32_e32 v5, v5, v3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v5
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NEXT: v_mov_b32_e32 v16, 0
-; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v7
-; GFX9-NEXT: v_rcp_f32_e32 v4, v4
+; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT: v_xor_b32_e32 v2, v2, v4
+; GFX9-NEXT: v_xor_b32_e32 v3, v3, v4
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v6, v3
+; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, 0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v15, 0
-; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GFX9-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4
-; GFX9-NEXT: v_trunc_f32_e32 v7, v7
-; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT: v_mul_lo_u32 v10, v9, v4
-; GFX9-NEXT: v_mul_hi_u32 v11, v8, v4
-; GFX9-NEXT: v_mul_lo_u32 v12, v8, v7
-; GFX9-NEXT: v_mul_lo_u32 v13, v8, v4
-; GFX9-NEXT: v_add3_u32 v10, v11, v12, v10
-; GFX9-NEXT: v_mul_lo_u32 v12, v4, v10
-; GFX9-NEXT: v_mul_hi_u32 v14, v4, v13
-; GFX9-NEXT: v_mul_hi_u32 v11, v4, v10
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v12
-; GFX9-NEXT: v_mul_lo_u32 v14, v7, v13
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v11, vcc
-; GFX9-NEXT: v_mul_hi_u32 v13, v7, v13
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v14
-; GFX9-NEXT: v_mul_hi_u32 v12, v7, v10
-; GFX9-NEXT: v_mul_lo_u32 v10, v7, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v13, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v12, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v10, vcc, v7, v11, s[4:5]
-; GFX9-NEXT: v_mul_lo_u32 v12, v8, v10
-; GFX9-NEXT: v_mul_hi_u32 v13, v8, v4
-; GFX9-NEXT: v_mul_lo_u32 v9, v9, v4
-; GFX9-NEXT: v_mul_lo_u32 v8, v8, v4
-; GFX9-NEXT: v_add_u32_e32 v7, v7, v11
-; GFX9-NEXT: v_add3_u32 v9, v13, v12, v9
-; GFX9-NEXT: v_mul_lo_u32 v12, v4, v9
-; GFX9-NEXT: v_mul_hi_u32 v13, v4, v8
-; GFX9-NEXT: v_mul_hi_u32 v14, v4, v9
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12
-; GFX9-NEXT: v_mul_hi_u32 v13, v10, v8
-; GFX9-NEXT: v_mul_lo_u32 v8, v10, v8
-; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v16, v14, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v12, v8
-; GFX9-NEXT: v_mul_hi_u32 v8, v10, v9
-; GFX9-NEXT: v_mul_lo_u32 v9, v10, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v13, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v15, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v12, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v16, v8, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v7, vcc, v7, v8, s[4:5]
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v1
-; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v8
-; GFX9-NEXT: v_xor_b32_e32 v9, v9, v8
-; GFX9-NEXT: v_mul_lo_u32 v10, v9, v7
-; GFX9-NEXT: v_mul_hi_u32 v11, v9, v4
+; GFX9-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
+; GFX9-NEXT: v_rcp_f32_e32 v5, v5
+; GFX9-NEXT: v_mov_b32_e32 v14, 0
+; GFX9-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GFX9-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
+; GFX9-NEXT: v_trunc_f32_e32 v6, v6
+; GFX9-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT: v_mul_lo_u32 v9, v8, v5
+; GFX9-NEXT: v_mul_hi_u32 v10, v7, v5
+; GFX9-NEXT: v_mul_lo_u32 v11, v7, v6
+; GFX9-NEXT: v_mul_lo_u32 v12, v7, v5
+; GFX9-NEXT: v_add3_u32 v9, v10, v11, v9
+; GFX9-NEXT: v_mul_lo_u32 v11, v5, v9
+; GFX9-NEXT: v_mul_hi_u32 v13, v5, v12
+; GFX9-NEXT: v_mul_hi_u32 v10, v5, v9
+; GFX9-NEXT: v_mul_hi_u32 v16, v6, v9
+; GFX9-NEXT: v_mul_lo_u32 v9, v6, v9
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v13, v11
+; GFX9-NEXT: v_mul_lo_u32 v13, v6, v12
+; GFX9-NEXT: v_mul_hi_u32 v12, v6, v12
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v10, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9
+; GFX9-NEXT: v_add_co_u32_e64 v5, s[4:5], v5, v9
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5]
+; GFX9-NEXT: v_mul_lo_u32 v11, v7, v9
+; GFX9-NEXT: v_mul_hi_u32 v12, v7, v5
+; GFX9-NEXT: v_mul_lo_u32 v8, v8, v5
+; GFX9-NEXT: v_mul_lo_u32 v7, v7, v5
+; GFX9-NEXT: v_add_u32_e32 v6, v6, v10
+; GFX9-NEXT: v_add3_u32 v8, v12, v11, v8
+; GFX9-NEXT: v_mul_lo_u32 v13, v5, v8
+; GFX9-NEXT: v_mul_hi_u32 v16, v5, v7
+; GFX9-NEXT: v_mul_hi_u32 v17, v5, v8
; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v8, vcc
-; GFX9-NEXT: v_xor_b32_e32 v1, v1, v8
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v12, vcc
-; GFX9-NEXT: v_mul_lo_u32 v12, v1, v4
-; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT: v_mul_hi_u32 v13, v1, v7
-; GFX9-NEXT: v_mul_lo_u32 v7, v1, v7
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v15, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v10, vcc
-; GFX9-NEXT: v_mul_lo_u32 v10, v5, v4
-; GFX9-NEXT: v_mul_lo_u32 v11, v6, v7
-; GFX9-NEXT: v_mul_hi_u32 v12, v6, v4
-; GFX9-NEXT: v_mul_lo_u32 v13, v6, v4
-; GFX9-NEXT: v_add3_u32 v10, v12, v11, v10
-; GFX9-NEXT: v_sub_u32_e32 v11, v1, v10
-; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v13
-; GFX9-NEXT: v_subb_co_u32_e64 v11, s[4:5], v11, v5, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v12, s[4:5], v9, v6
-; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[6:7], 0, v11, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7]
-; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 2, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v7, s[6:7]
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v17, s[6:7], 1, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v18, s[6:7], 0, v7, s[6:7]
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v14
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v9, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
-; GFX9-NEXT: v_subb_co_u32_e64 v5, s[4:5], v11, v5, s[4:5]
-; GFX9-NEXT: v_sub_co_u32_e64 v6, s[4:5], v12, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v16, vcc
-; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[4:5], 0, v5, s[4:5]
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v17, v15, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX9-NEXT: v_xor_b32_e32 v10, v8, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v14, vcc
-; GFX9-NEXT: v_xor_b32_e32 v3, v4, v10
-; GFX9-NEXT: v_xor_b32_e32 v5, v5, v8
-; GFX9-NEXT: v_xor_b32_e32 v7, v7, v10
-; GFX9-NEXT: v_sub_co_u32_e64 v3, s[8:9], v3, v10
-; GFX9-NEXT: v_xor_b32_e32 v1, v1, v8
-; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8
-; GFX9-NEXT: v_subb_co_u32_e64 v4, s[8:9], v7, v10, s[8:9]
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v8, vcc
+; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7
+; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v16, v13
+; GFX9-NEXT: v_mul_hi_u32 v11, v9, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v17, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v9, v8
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v13, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v12, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v11, v14, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v15, v9, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v7
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7
+; GFX9-NEXT: v_mul_lo_u32 v8, v0, v6
+; GFX9-NEXT: v_mul_hi_u32 v9, v0, v5
+; GFX9-NEXT: v_mul_hi_u32 v10, v0, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v15, v10, vcc
+; GFX9-NEXT: v_mul_lo_u32 v10, v1, v5
+; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5
+; GFX9-NEXT: v_mul_hi_u32 v11, v1, v6
+; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v14, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v8, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v3, v5
+; GFX9-NEXT: v_mul_lo_u32 v9, v2, v6
+; GFX9-NEXT: v_mul_hi_u32 v10, v2, v5
+; GFX9-NEXT: v_mul_lo_u32 v11, v2, v5
+; GFX9-NEXT: v_add3_u32 v8, v10, v9, v8
+; GFX9-NEXT: v_sub_u32_e32 v9, v1, v8
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v11
+; GFX9-NEXT: v_subb_co_u32_e64 v9, s[4:5], v9, v3, vcc
+; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v0, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[6:7], 0, v9, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[6:7]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[6:7]
+; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 2, v5
+; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v6, s[6:7]
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 1, v5
+; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v6, s[6:7]
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v12
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v16, v14, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc
+; GFX9-NEXT: v_subb_co_u32_e64 v3, s[4:5], v9, v3, s[4:5]
+; GFX9-NEXT: v_sub_co_u32_e64 v2, s[4:5], v10, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v15, v13, s[6:7]
+; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX9-NEXT: v_xor_b32_e32 v8, v7, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
+; GFX9-NEXT: v_xor_b32_e32 v4, v5, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_xor_b32_e32 v6, v6, v8
+; GFX9-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v8
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v7
+; GFX9-NEXT: v_subb_co_u32_e64 v5, s[8:9], v6, v8, s[8:9]
+; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7
+; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v0, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v7, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: BB8_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[10:11]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
@@ -910,7 +917,8 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: ; %bb.3:
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2
; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -919,24 +927,23 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2
-; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
+; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v6, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc
; GFX9-NEXT: BB8_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-NEXT: v_mov_b32_e32 v2, v5
-; GFX9-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-NEXT: v_mov_b32_e32 v3, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
%d = sdiv i64 %a, %b
%r = srem i64 %a, %b
@@ -1036,40 +1043,42 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mul_lo_u32 v9, v2, v4
; GFX9-NEXT: v_add3_u32 v6, v8, v7, v6
; GFX9-NEXT: v_sub_u32_e32 v7, v1, v6
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v9
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v9
; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v7, v3, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v8, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[6:7], 0, v7, s[4:5]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v3
+; GFX9-NEXT: v_sub_co_u32_e64 v8, s[4:5], v0, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[6:7], 0, v7, s[4:5]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7]
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7]
-; GFX9-NEXT: v_add_co_u32_e64 v12, s[6:7], 2, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v13, s[6:7], 0, v5, s[6:7]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v9, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[6:7]
+; GFX9-NEXT: v_add_co_u32_e64 v11, s[6:7], 2, v4
+; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, v5, s[6:7]
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v14, s[6:7], 1, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[6:7], 0, v5, s[6:7]
+; GFX9-NEXT: v_add_co_u32_e64 v13, s[6:7], 1, v4
+; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, v5, s[6:7]
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v11
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v10
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v15, v13, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v14, v12, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v14, v12, s[6:7]
; GFX9-NEXT: v_subb_co_u32_e64 v3, s[4:5], v7, v3, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT: v_sub_co_u32_e64 v6, s[4:5], v9, v2
+; GFX9-NEXT: v_sub_co_u32_e64 v2, s[4:5], v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[6:7]
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v13, v11, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v6, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v2, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX9-NEXT: BB9_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[8:9]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index b86db5a7ac689..eedb973f6d167 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -86,7 +86,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb
; GCN-NEXT: s_cbranch_execz [[THEN_INNER:BB[0-9_]+]]
; GCN-NEXT: ; %bb.{{[0-9]+}}:
; GCN: store_dword
-; GCN-NEXT: {{^}}[[THEN_INNER]]:
+; GCN: {{^}}[[THEN_INNER]]:
; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_INNER3:s\[[0-9:]+\]]], [[SAVEEXEC_INNER2]]
; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_INNER3]]
; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
@@ -136,7 +136,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
; GCN: store_dword
; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]:
; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_ELSE]]
-; GCN-NEXT: {{^}}[[THEN_OUTER]]:
+; GCN: {{^}}[[THEN_OUTER]]:
; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_OUTER3:s\[[0-9:]+\]]], [[SAVEEXEC_OUTER2]]
; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_OUTER3]]
; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index d3d2b6949609c..a687d150914a9 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -330,6 +330,8 @@
; GCN-O1-NEXT: Process Implicit Definitions
; GCN-O1-NEXT: Remove unreachable machine basic blocks
; GCN-O1-NEXT: Live Variable Analysis
+; GCN-O1-NEXT: MachineDominator Tree Construction
+; GCN-O1-NEXT: SI Optimize VGPR LiveRange
; GCN-O1-NEXT: Eliminate PHI nodes for register allocation
; GCN-O1-NEXT: SI Lower control flow pseudo instructions
; GCN-O1-NEXT: Two-Address instruction pass
@@ -610,6 +612,7 @@
; GCN-O1-OPTS-NEXT: Process Implicit Definitions
; GCN-O1-OPTS-NEXT: Remove unreachable machine basic blocks
; GCN-O1-OPTS-NEXT: Live Variable Analysis
+; GCN-O1-OPTS-NEXT: SI Optimize VGPR LiveRange
; GCN-O1-OPTS-NEXT: Eliminate PHI nodes for register allocation
; GCN-O1-OPTS-NEXT: SI Lower control flow pseudo instructions
; GCN-O1-OPTS-NEXT: Two-Address instruction pass
@@ -890,6 +893,7 @@
; GCN-O2-NEXT: Process Implicit Definitions
; GCN-O2-NEXT: Remove unreachable machine basic blocks
; GCN-O2-NEXT: Live Variable Analysis
+; GCN-O2-NEXT: SI Optimize VGPR LiveRange
; GCN-O2-NEXT: Eliminate PHI nodes for register allocation
; GCN-O2-NEXT: SI Lower control flow pseudo instructions
; GCN-O2-NEXT: Two-Address instruction pass
@@ -1184,6 +1188,7 @@
; GCN-O3-NEXT: Process Implicit Definitions
; GCN-O3-NEXT: Remove unreachable machine basic blocks
; GCN-O3-NEXT: Live Variable Analysis
+; GCN-O3-NEXT: SI Optimize VGPR LiveRange
; GCN-O3-NEXT: Eliminate PHI nodes for register allocation
; GCN-O3-NEXT: SI Lower control flow pseudo instructions
; GCN-O3-NEXT: Two-Address instruction pass
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 07c2575ab422f..5e9b8091c834d 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -164,15 +164,16 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out,
; SI-NEXT: s_cbranch_execz BB3_2
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v1, v[1:2], s[8:11], 0 addr64
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0
; SI-NEXT: s_and_b64 s[8:9], vcc, exec
; SI-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: BB3_2: ; %Flow
; SI-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; SI-NEXT: s_xor_b64 exec, exec, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index bc84fe8adab60..aec41958fe0a7 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1160,6 +1160,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
; SI-NEXT: s_cbranch_execz BB14_3
; SI-NEXT: ; %bb.1: ; %kill
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT: ; implicit-def: $vgpr0
+; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_cbranch_scc0 BB14_6
; SI-NEXT: ; %bb.2: ; %kill
; SI-NEXT: s_mov_b64 exec, 0
@@ -1197,6 +1199,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
; GFX10-WAVE64-NEXT: s_cbranch_execz BB14_3
; GFX10-WAVE64-NEXT: ; %bb.1: ; %kill
; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr0
+; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr1
; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB14_6
; GFX10-WAVE64-NEXT: ; %bb.2: ; %kill
; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0
@@ -1234,6 +1238,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
; GFX10-WAVE32-NEXT: s_cbranch_execz BB14_3
; GFX10-WAVE32-NEXT: ; %bb.1: ; %kill
; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, exec_lo
+; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr0
+; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr1
; GFX10-WAVE32-NEXT: s_cbranch_scc0 BB14_6
; GFX10-WAVE32-NEXT: ; %bb.2: ; %kill
; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
new file mode 100644
index 0000000000000..aa721f9fcabcd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-opt-vgpr-liverange=true -stop-after=si-opt-vgpr-liverange -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; a normal if-else
+define amdgpu_ps float @else1(i32 %z, float %v) #0 {
+ ; SI-LABEL: name: else1
+ ; SI: bb.0.main_body:
+ ; SI: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; SI: liveins: $vgpr0, $vgpr1
+ ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1
+ ; SI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
+ ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec
+ ; SI: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; SI: S_BRANCH %bb.3
+ ; SI: bb.1.Flow:
+ ; SI: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %13:vgpr_32, %bb.0, %4, %bb.3
+ ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, undef %15:vgpr_32, %bb.3
+ ; SI: [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; SI: S_BRANCH %bb.2
+ ; SI: bb.2.if:
+ ; SI: successors: %bb.4(0x80000000)
+ ; SI: %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], [[PHI1]], implicit $mode, implicit $exec
+ ; SI: S_BRANCH %bb.4
+ ; SI: bb.3.else:
+ ; SI: successors: %bb.1(0x80000000)
+ ; SI: %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, killed [[COPY]], implicit $mode, implicit $exec
+ ; SI: S_BRANCH %bb.1
+ ; SI: bb.4.end:
+ ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2
+ ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; SI: $vgpr0 = COPY killed [[PHI2]]
+ ; SI: SI_RETURN_TO_EPILOG killed $vgpr0
+main_body:
+ %cc = icmp sgt i32 %z, 5
+ br i1 %cc, label %if, label %else
+
+if:
+ %v.if = fmul float %v, 2.0
+ br label %end
+
+else:
+ %v.else = fmul float %v, 3.0
+ br label %end
+
+end:
+ %r = phi float [ %v.if, %if ], [ %v.else, %else ]
+ ret float %r
+}
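+; Note: the `undef` incoming operands in the bb.1.Flow PHIs above are the
+; rewrite done by si-opt-vgpr-liverange: %v ([[COPY]]) is not re-read after
+; the else arm, and the else result %4 does not exist on the direct edge
+; from main_body, so neither value is kept live across the other arm.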
+
+
+; %v is used after the if-else

+define amdgpu_ps float @else2(i32 %z, float %v) #0 {
+ ; SI-LABEL: name: else2
+ ; SI: bb.0.main_body:
+ ; SI: successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ ; SI: liveins: $vgpr0, $vgpr1
+ ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1
+ ; SI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
+ ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec
+ ; SI: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; SI: S_BRANCH %bb.3
+ ; SI: bb.1.Flow:
+ ; SI: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; SI: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %15:vgpr_32, %bb.0, %4, %bb.3
+ ; SI: [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; SI: S_BRANCH %bb.2
+ ; SI: bb.2.if:
+ ; SI: successors: %bb.4(0x80000000)
+ ; SI: %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[COPY]], [[COPY]], implicit $mode, implicit $exec
+ ; SI: S_BRANCH %bb.4
+ ; SI: bb.3.else:
+ ; SI: successors: %bb.1(0x80000000)
+ ; SI: %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, [[COPY]], implicit $mode, implicit $exec
+ ; SI: S_BRANCH %bb.1
+ ; SI: bb.4.end:
+ ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.1, %3, %bb.2
+ ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2
+ ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; SI: %14:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], killed [[PHI2]], implicit $mode, implicit $exec
+ ; SI: $vgpr0 = COPY killed %14
+ ; SI: SI_RETURN_TO_EPILOG killed $vgpr0
+main_body:
+ %cc = icmp sgt i32 %z, 5
+ br i1 %cc, label %if, label %else
+
+if:
+ %v.if = fmul float %v, 2.0
+ br label %end
+
+else:
+ %v.else = fmul float %v, 3.0
+ br label %end
+
+end:
+ %r0 = phi float [ %v.if, %if ], [ %v, %else ]
+ %r1 = phi float [ %v.if, %if ], [ %v.else, %else ]
+ %r2 = fadd float %r0, %r1
+ ret float %r2
+}
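+; Note: here %v ([[COPY]]) feeds the bb.4.end PHI on the %bb.1 edge, so its
+; live range cannot be cut short in the else arm; only the if-side value gets
+; an undef incoming operand in the Flow PHI (undef %15 from %bb.0).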
+
+; an if-else inside a loop: %x can be optimized, but %v cannot be.
+define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
+ ; SI-LABEL: name: else3
+ ; SI: bb.0.entry:
+ ; SI: successors: %bb.1(0x80000000)
+ ; SI: liveins: $vgpr0, $vgpr1, $sgpr0, $vgpr2
+ ; SI: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr2
+ ; SI: [[COPY1:%[0-9]+]]:sgpr_32 = COPY killed $sgpr0
+ ; SI: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1
+ ; SI: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
+ ; SI: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY3]], implicit $exec
+ ; SI: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; SI: bb.1.for.body:
+ ; SI: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; SI: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %14, %bb.5
+ ; SI: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %13, %bb.5
+ ; SI: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_GT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; SI: S_BRANCH %bb.4
+ ; SI: bb.2.Flow:
+ ; SI: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; SI: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.1, %10, %bb.4
+ ; SI: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %37:vgpr_32, %bb.1, %9, %bb.4
+ ; SI: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %40:vgpr_32, %bb.4
+ ; SI: [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; SI: S_BRANCH %bb.3
+ ; SI: bb.3.if:
+ ; SI: successors: %bb.5(0x80000000)
+ ; SI: %7:vgpr_32 = nofpexcept V_MUL_F32_e32 [[PHI]], [[COPY2]], implicit $mode, implicit $exec
+ ; SI: %8:vgpr_32, dead %32:sreg_64 = V_ADD_CO_U32_e64 1, killed [[PHI4]], 0, implicit $exec
+ ; SI: S_BRANCH %bb.5
+ ; SI: bb.4.else:
+ ; SI: successors: %bb.2(0x80000000)
+ ; SI: %9:vgpr_32 = nofpexcept V_MUL_F32_e32 [[COPY2]], [[PHI1]], implicit $mode, implicit $exec
+ ; SI: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
+ ; SI: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_MUL_LO_U32_e64_]]
+ ; SI: S_BRANCH %bb.2
+ ; SI: bb.5.if.end:
+ ; SI: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+ ; SI: [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.2, %7, %bb.3
+ ; SI: [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.2, %8, %bb.3
+ ; SI: SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; SI: %13:vgpr_32, dead %34:sreg_64 = V_ADD_CO_U32_e64 1, [[PHI6]], 0, implicit $exec
+ ; SI: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[PHI]], 1, implicit-def dead $scc
+ ; SI: S_CMP_LT_I32 [[S_ADD_I32_]], [[COPY1]], implicit-def $scc
+ ; SI: S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ ; SI: S_BRANCH %bb.6
+ ; SI: bb.6.for.end:
+ ; SI: %35:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec
+ ; SI: $vgpr0 = COPY killed %35
+ ; SI: SI_RETURN_TO_EPILOG killed $vgpr0
+entry:
+; %break = icmp sgt i32 %bound, 0
+; br i1 %break, label %for.body, label %for.end
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %entry ], [ %inc, %if.end ]
+ %x = phi i32 [ %x0, %entry ], [ %xinc, %if.end ]
+ %cc = icmp sgt i32 %z, 5
+ br i1 %cc, label %if, label %else
+
+if:
+ %i.tmp = bitcast i32 %i to float
+ %v.if = fmul float %v, %i.tmp
+ %x.if = add i32 %x, 1
+ br label %if.end
+
+else:
+ %x.tmp = bitcast i32 %x to float
+ %v.else = fmul float %v, %x.tmp
+ %x.else = mul i32 %x, 3
+ br label %if.end
+
+if.end:
+ %v.endif = phi float [ %v.if, %if ], [ %v.else, %else ]
+ %x.endif = phi i32 [ %x.if, %if ], [ %x.else, %else ]
+
+ %xinc = add i32 %x.endif, 1
+ %inc = add i32 %i, 1
+ %cond = icmp slt i32 %inc, %bound
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %x_float = bitcast i32 %x.endif to float
+ %r = fadd float %x_float, %v.endif
+ ret float %r
+}
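+; Note: the loop-carried %x value ([[PHI1]]) becomes `undef %40` on the
+; %bb.4 edge of the Flow PHI, since each arm reads it and then produces its
+; own replacement. %v ([[COPY2]]) keeps an ordinary live range: both arms
+; re-read it on every iteration, so the pass cannot shorten it.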
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
new file mode 100644
index 0000000000000..0b4859eba68c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-opt-vgpr-liverange=true -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; a normal if-else
+define amdgpu_ps float @else1(i32 %z, float %v) #0 {
+; SI-LABEL: else1:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 6, v0
+; SI-NEXT: ; implicit-def: $vgpr0
+; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
+; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; %bb.2: ; %Flow
+; SI-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; SI-NEXT: s_xor_b64 exec, exec, s[0:1]
+; SI-NEXT: ; %bb.3: ; %if
+; SI-NEXT: v_add_f32_e32 v0, v1, v1
+; SI-NEXT: ; %bb.4: ; %end
+; SI-NEXT: s_or_b64 exec, exec, s[0:1]
+; SI-NEXT: ; return to shader part epilog
+main_body:
+ %cc = icmp sgt i32 %z, 5
+ br i1 %cc, label %if, label %else
+
+if:
+ %v.if = fmul float %v, 2.0
+ br label %end
+
+else:
+ %v.else = fmul float %v, 3.0
+ br label %end
+
+end:
+ %r = phi float [ %v.if, %if ], [ %v.else, %else ]
+ ret float %r
+}
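+; Note: the `implicit-def: $vgpr0` / `implicit-def: $vgpr1` annotations above
+; correspond to IMPLICIT_DEFs inserted by the pass: v1 (%v) is dead once the
+; else arm has consumed it, so both arms can compute their result directly
+; into v0 and the join needs no copies.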
+
+
+; %v is used after the if-else
+define amdgpu_ps float @else2(i32 %z, float %v) #0 {
+; SI-LABEL: else2:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 6, v0
+; SI-NEXT: ; implicit-def: $vgpr0
+; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
+; SI-NEXT: ; %bb.2: ; %Flow
+; SI-NEXT: s_or_saveexec_b64 s[0:1], s[0:1]
+; SI-NEXT: s_xor_b64 exec, exec, s[0:1]
+; SI-NEXT: ; %bb.3: ; %if
+; SI-NEXT: v_add_f32_e32 v1, v1, v1
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: ; %bb.4: ; %end
+; SI-NEXT: s_or_b64 exec, exec, s[0:1]
+; SI-NEXT: v_add_f32_e32 v0, v1, v0
+; SI-NEXT: ; return to shader part epilog
+main_body:
+ %cc = icmp sgt i32 %z, 5
+ br i1 %cc, label %if, label %else
+
+if:
+ %v.if = fmul float %v, 2.0
+ br label %end
+
+else:
+ %v.else = fmul float %v, 3.0
+ br label %end
+
+end:
+ %r0 = phi float [ %v.if, %if ], [ %v, %else ]
+ %r1 = phi float [ %v.if, %if ], [ %v.else, %else ]
+ %r2 = fadd float %r0, %r1
+ ret float %r2
+}
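+; Note: only $vgpr0 is marked implicit-def here. v1 must still hold %v across
+; the join because %r0 reads it on the else path, which is why the if arm
+; writes its result to v1 and then copies it into v0 for the second phi.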
+
+; an if-else inside a loop: %x can be optimized, but %v cannot be.
+define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
+; SI-LABEL: else3:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 6, v0
+; SI-NEXT: s_mov_b32 s1, 0
+; SI-NEXT: s_branch BB2_2
+; SI-NEXT: BB2_1: ; %if.end
+; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: s_add_i32 s1, s1, 1
+; SI-NEXT: s_cmp_lt_i32 s1, s0
+; SI-NEXT: v_add_u32_e64 v2, s[2:3], 1, v0
+; SI-NEXT: s_cbranch_scc0 BB2_6
+; SI-NEXT: BB2_2: ; %for.body
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: ; implicit-def: $vgpr0
+; SI-NEXT: ; implicit-def: $vgpr3
+; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT: ; %bb.3: ; %else
+; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SI-NEXT: v_mul_lo_u32 v0, v2, 3
+; SI-NEXT: v_mul_f32_e32 v3, v1, v2
+; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; %bb.4: ; %Flow
+; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SI-NEXT: s_or_saveexec_b64 s[4:5], s[2:3]
+; SI-NEXT: s_xor_b64 exec, exec, s[4:5]
+; SI-NEXT: s_cbranch_execz BB2_1
+; SI-NEXT: ; %bb.5: ; %if
+; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
+; SI-NEXT: v_mul_f32_e32 v3, s1, v1
+; SI-NEXT: v_add_u32_e64 v0, s[2:3], 1, v2
+; SI-NEXT: s_branch BB2_1
+; SI-NEXT: BB2_6: ; %for.end
+; SI-NEXT: v_add_f32_e32 v0, v0, v3
+; SI-NEXT: ; return to shader part epilog
+entry:
+; %break = icmp sgt i32 %bound, 0
+; br i1 %break, label %for.body, label %for.end
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %entry ], [ %inc, %if.end ]
+ %x = phi i32 [ %x0, %entry ], [ %xinc, %if.end ]
+ %cc = icmp sgt i32 %z, 5
+ br i1 %cc, label %if, label %else
+
+if:
+ %i.tmp = bitcast i32 %i to float
+ %v.if = fmul float %v, %i.tmp
+ %x.if = add i32 %x, 1
+ br label %if.end
+
+else:
+ %x.tmp = bitcast i32 %x to float
+ %v.else = fmul float %v, %x.tmp
+ %x.else = mul i32 %x, 3
+ br label %if.end
+
+if.end:
+ %v.endif = phi float [ %v.if, %if ], [ %v.else, %else ]
+ %x.endif = phi i32 [ %x.if, %if ], [ %x.else, %else ]
+
+ %xinc = add i32 %x.endif, 1
+ %inc = add i32 %i, 1
+ %cond = icmp slt i32 %inc, %bound
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %x_float = bitcast i32 %x.endif to float
+ %r = fadd float %x_float, %v.endif
+ ret float %r
+}
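+; Note: inside the loop, v0/v3 are implicit-def'd at the loop header and v2
+; in the else arm, so %x and the per-arm temporaries do not stay live across
+; the opposite arm; v1 (%v) never gets an implicit-def because both arms read
+; it on every iteration.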
+
+attributes #0 = { nounwind }