[llvm] 208332d - [AMDGPU] Add Optimize VGPR LiveRange Pass.

Ruiling Song via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 21 00:27:17 PDT 2021


Author: Ruiling Song
Date: 2021-06-21T15:25:55+08:00
New Revision: 208332de8abf126b6fb5590bea47cd12257bc064

URL: https://github.com/llvm/llvm-project/commit/208332de8abf126b6fb5590bea47cd12257bc064
DIFF: https://github.com/llvm/llvm-project/commit/208332de8abf126b6fb5590bea47cd12257bc064.diff

LOG: [AMDGPU] Add Optimize VGPR LiveRange Pass.

This pass aims to optimize VGPR live ranges in a typical divergent if-else
control flow. For example:

def(a)
if(cond)
  use(a)
  ... // A
else
  use(a)

As AMDGPU accesses VGPRs with respect to the active mask, we can mark `a` as
dead in region A. For details, please refer to the comments in the
implementation file.
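
Conceptually, the pass then rewrites the example above into something like
the following (a sketch; the name `a2` is illustrative, not actual pass
output):

def(a)
if(cond)
  use(a)
  ... // A: `a` is dead here
flow:
  a2 = phi(a from if, undef from then)
else
  use(a2)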

The pass is enabled by default; the frontend can disable it through
"-amdgpu-opt-vgpr-liverange=false".

Differential Revision: https://reviews.llvm.org/D102212

Added: 
    llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
    llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
    llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
    llvm/test/CodeGen/AMDGPU/bypass-div.ll
    llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
    llvm/test/CodeGen/AMDGPU/skip-if-dead.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 2cfda5533dbb9..fa3c7e657fc1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -56,6 +56,7 @@ FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIOptimizeExecMaskingPreRAPass();
+FunctionPass *createSIOptimizeVGPRLiveRangePass();
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
@@ -297,6 +298,9 @@ struct AMDGPUUnifyMetadataPass : PassInfoMixin<AMDGPUUnifyMetadataPass> {
 void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&);
 extern char &SIOptimizeExecMaskingPreRAID;
 
+void initializeSIOptimizeVGPRLiveRangePass(PassRegistry &);
+extern char &SIOptimizeVGPRLiveRangeID;
+
 void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
 extern char &AMDGPUAnnotateUniformValuesPassID;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 63b3a8d3b29e0..2c1e5092b26b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -162,6 +162,11 @@ static cl::opt<bool> EnableRegReassign(
   cl::init(true),
   cl::Hidden);
 
+static cl::opt<bool> OptVGPRLiveRange(
+    "amdgpu-opt-vgpr-liverange",
+    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
+    cl::init(true), cl::Hidden);
+
 // Enable atomic optimization
 static cl::opt<bool> EnableAtomicOptimizations(
   "amdgpu-atomic-optimizations",
@@ -225,6 +230,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIOptimizeExecMaskingPreRAPass(*PR);
+  initializeSIOptimizeVGPRLiveRangePass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUFixFunctionBitcastsPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
@@ -1190,6 +1196,12 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (TM->getOptLevel() > CodeGenOpt::Less)
     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
 
+  // FIXME: when an instruction has a killed operand and the instruction is
+  // inside a bundle, it seems only the BUNDLE instruction appears as the kill
+  // of the register in LiveVariables. This would trigger a verifier failure;
+  // we should fix it and enable the verifier.
+  if (OptVGPRLiveRange)
+    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID, false);
   // This must be run immediately after phi elimination and before
   // TwoAddressInstructions, otherwise the processing of the tied operand of
   // SI_ELSE will introduce a copy of the tied operand source after the else.

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index bf44ad6a000d4..0e3ea8d313a26 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -132,6 +132,7 @@ add_llvm_target(AMDGPUCodeGen
   SIMemoryLegalizer.cpp
   SIOptimizeExecMasking.cpp
   SIOptimizeExecMaskingPreRA.cpp
+  SIOptimizeVGPRLiveRange.cpp
   SIPeepholeSDWA.cpp
   SIPostRABundler.cpp
   SIPreEmitPeephole.cpp

diff  --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
new file mode 100644
index 0000000000000..8e8d4bff672ff
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -0,0 +1,497 @@
+//===--------------------- SIOptimizeVGPRLiveRange.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass tries to remove unnecessary VGPR live ranges in divergent if-else
+/// structures.
+///
+/// When we do structurization, we usually transform an if-else into two
+/// successive if-thens (with a flow block to do the predicate inversion).
+/// Consider a simple case after structurization: a divergent value %a is
+/// defined before the if-else and used in both the THEN part (the use in THEN
+/// is optional) and the ELSE part:
+///    bb.if:
+///      %a = ...
+///      ...
+///    bb.then:
+///      ... = op %a
+///      ... // %a can be dead here
+///    bb.flow:
+///      ...
+///    bb.else:
+///      ... = %a
+///      ...
+///    bb.endif
+///
+///  As the register allocator has no idea of the thread control flow, it will
+///  just assume %a is alive through the whole range of bb.then because of the
+///  later use in bb.else. On the AMDGPU architecture, VGPRs are accessed with
+///  respect to the exec mask. For this if-else case, the lanes active in
+///  bb.then will be inactive in bb.else, and vice versa. So we are safe to say
+///  that %a is dead from the last use in bb.then until the end of the block:
+///  the instructions in bb.then will only overwrite lanes that will never be
+///  accessed in bb.else.
+///
+///  This pass tells the register allocator that %a is in fact dead by
+///  inserting a phi-node in bb.flow saying that %a is undef when coming from
+///  bb.then, and then replacing the uses in bb.else with the result of the
+///  newly inserted phi.
+///
+///  Two key conditions must be met to ensure correctness:
+///  1.) The def-point should be at the same loop level as the if-else-endif to
+///      make sure the second loop iteration still gets correct data.
+///  2.) There should be no further uses after the IF-ELSE region.
+///
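+///  For illustration, the example above then becomes (the name %a.flow is
+///  made up here, not what the pass actually emits):
+///    bb.flow:
+///      %a.flow = PHI [ %a, bb.if ], [ undef, bb.then ]
+///      ...
+///    bb.else:
+///      ... = %a.flow
+///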
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-opt-vgpr-liverange"
+
+namespace {
+
+class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
+private:
+  const SIRegisterInfo *TRI = nullptr;
+  const SIInstrInfo *TII = nullptr;
+  LiveVariables *LV = nullptr;
+  MachineDominatorTree *MDT = nullptr;
+  const MachineLoopInfo *Loops = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+
+public:
+  static char ID;
+
+  MachineBasicBlock *getElseTarget(MachineBasicBlock *MBB) const;
+
+  void collectElseRegionBlocks(MachineBasicBlock *Flow,
+                               MachineBasicBlock *Endif,
+                               SmallSetVector<MachineBasicBlock *, 16> &) const;
+
+  void
+  collectCandidateRegisters(MachineBasicBlock *If, MachineBasicBlock *Flow,
+                            MachineBasicBlock *Endif,
+                            SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks,
+                            SmallVectorImpl<Register> &CandidateRegs) const;
+
+  void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB,
+                             SmallVectorImpl<MachineInstr *> &Uses) const;
+
+  void updateLiveRangeInThenRegion(Register Reg, MachineBasicBlock *If,
+                                   MachineBasicBlock *Flow) const;
+
+  void updateLiveRangeInElseRegion(
+      Register Reg, Register NewReg, MachineBasicBlock *Flow,
+      MachineBasicBlock *Endif,
+      SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
+
+  void
+  optimizeLiveRange(Register Reg, MachineBasicBlock *If,
+                    MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+                    SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const;
+
+  SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI Optimize VGPR LiveRange";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveVariables>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<LiveVariables>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::IsSSA);
+  }
+};
+
+} // end anonymous namespace
+
+// Check whether the MBB is an else flow block and get its branch target, which
+// is the Endif block.
+MachineBasicBlock *
+SIOptimizeVGPRLiveRange::getElseTarget(MachineBasicBlock *MBB) const {
+  for (auto &BR : MBB->terminators()) {
+    if (BR.getOpcode() == AMDGPU::SI_ELSE)
+      return BR.getOperand(2).getMBB();
+  }
+  return nullptr;
+}
+
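+/// Collect all the blocks of the ELSE region by walking the predecessors
+/// backwards from \p Endif, stopping at the \p Flow block.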
+void SIOptimizeVGPRLiveRange::collectElseRegionBlocks(
+    MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+    SmallSetVector<MachineBasicBlock *, 16> &Blocks) const {
+  assert(Flow != Endif);
+
+  MachineBasicBlock *MBB = Endif;
+  unsigned Cur = 0;
+  while (MBB) {
+    for (auto *Pred : MBB->predecessors()) {
+      if (Pred != Flow && !Blocks.contains(Pred))
+        Blocks.insert(Pred);
+    }
+
+    if (Cur < Blocks.size())
+      MBB = Blocks[Cur++];
+    else
+      MBB = nullptr;
+  }
+
+  LLVM_DEBUG(dbgs() << "Found Else blocks: ");
+  for (auto *MBB : Blocks)
+    LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << ' ');
+  LLVM_DEBUG(dbgs() << '\n');
+}
+
+/// Find the instructions (excluding phi) in \p MBB that use \p Reg.
+void SIOptimizeVGPRLiveRange::findNonPHIUsesInBlock(
+    Register Reg, MachineBasicBlock *MBB,
+    SmallVectorImpl<MachineInstr *> &Uses) const {
+  for (auto &UseMI : MRI->use_nodbg_instructions(Reg)) {
+    if (UseMI.getParent() == MBB && !UseMI.isPHI())
+      Uses.push_back(&UseMI);
+  }
+}
+
+/// Collect the registers killed in the ELSE region that are not alive through
+/// the whole THEN region.
+void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
+    MachineBasicBlock *If, MachineBasicBlock *Flow, MachineBasicBlock *Endif,
+    SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks,
+    SmallVectorImpl<Register> &CandidateRegs) const {
+
+  SmallSet<Register, 8> KillsInElse;
+
+  for (auto *Else : ElseBlocks) {
+    for (auto &MI : Else->instrs()) {
+      if (MI.isDebugInstr())
+        continue;
+
+      for (auto &MO : MI.operands()) {
+        if (!MO.isReg() || !MO.getReg() || MO.isDef())
+          continue;
+
+        Register MOReg = MO.getReg();
+        // We can only optimize AGPR/VGPR virtual registers
+        if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg))
+          continue;
+
+        if (MO.isKill() && MO.readsReg()) {
+          LiveVariables::VarInfo &VI = LV->getVarInfo(MOReg);
+          const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent();
+          // Make sure two conditions are met:
+          // a.) the value is defined before/in the IF block;
+          // b.) it is defined at the same loop level.
+          if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
+              Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If))
+            KillsInElse.insert(MOReg);
+        }
+      }
+    }
+  }
+
+  // Check the phis in the Endif, looking for values coming from the ELSE
+  // region. Make sure the phi-use is the last use.
+  for (auto &MI : Endif->phis()) {
+    for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) {
+      auto &MO = MI.getOperand(Idx);
+      auto *Pred = MI.getOperand(Idx + 1).getMBB();
+      if (Pred == Flow)
+        continue;
+      assert(ElseBlocks.contains(Pred) && "Should be from Else region\n");
+
+      if (!MO.isReg() || !MO.getReg() || MO.isUndef())
+        continue;
+
+      Register Reg = MO.getReg();
+      if (Reg.isPhysical() || !TRI->isVectorRegister(*MRI, Reg))
+        continue;
+
+      LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+
+      if (VI.isLiveIn(*Endif, Reg, *MRI)) {
+        LLVM_DEBUG(dbgs() << "Excluding " << printReg(Reg, TRI)
+                          << " as Live in Endif\n");
+        continue;
+      }
+      // Make sure two conditions are met:
+      // a.) the value is defined before/in the IF block;
+      // b.) it is defined at the same loop level.
+      const MachineBasicBlock *DefMBB = MRI->getVRegDef(Reg)->getParent();
+      if ((VI.AliveBlocks.test(If->getNumber()) || DefMBB == If) &&
+          Loops->getLoopFor(DefMBB) == Loops->getLoopFor(If))
+        KillsInElse.insert(Reg);
+    }
+  }
+
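+  // A register cannot be optimized if it is still live across the Flow block:
+  // either it has a non-phi use in the Flow or Endif block, or a phi use
+  // there shows the value flowing out of the THEN region or through Flow into
+  // Endif.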
+  auto IsLiveThroughThen = [&](Register Reg) {
+    for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+         ++I) {
+      if (!I->readsReg())
+        continue;
+      auto *UseMI = I->getParent();
+      auto *UseMBB = UseMI->getParent();
+      if (UseMBB == Flow || UseMBB == Endif) {
+        if (!UseMI->isPHI())
+          return true;
+
+        auto *IncomingMBB = UseMI->getOperand(I.getOperandNo() + 1).getMBB();
+        // The register is live through the path If->Flow or Flow->Endif;
+        // we should not optimize such cases.
+        if ((UseMBB == Flow && IncomingMBB != If) ||
+            (UseMBB == Endif && IncomingMBB == Flow))
+          return true;
+      }
+    }
+    return false;
+  };
+
+  for (auto Reg : KillsInElse) {
+    if (!IsLiveThroughThen(Reg))
+      CandidateRegs.push_back(Reg);
+  }
+}
+
+// Recalculate the liveness of \p Reg in the THEN region
+void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
+    Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
+
+  SmallPtrSet<MachineBasicBlock *, 16> PHIIncoming;
+
+  MachineBasicBlock *ThenEntry = nullptr;
+  for (auto *Succ : If->successors()) {
+    if (Succ != Flow) {
+      ThenEntry = Succ;
+      break;
+    }
+  }
+  assert(ThenEntry && "No successor in Then region?");
+
+  LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+  df_iterator_default_set<MachineBasicBlock *, 16> Visited;
+
+  for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
+    if (MBB == Flow)
+      break;
+
+    // Clear the alive bit, as we will recalculate it afterwards
+    LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB)
+                      << '\n');
+    OldVarInfo.AliveBlocks.reset(MBB->getNumber());
+  }
+
+  // Get the blocks the Reg should be alive through
+  for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
+       ++I) {
+    auto *UseMI = I->getParent();
+    if (UseMI->isPHI() && I->readsReg()) {
+      if (Visited.contains(UseMI->getParent()))
+        PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB());
+    }
+  }
+
+  Visited.clear();
+
+  for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
+    if (MBB == Flow)
+      break;
+
+    SmallVector<MachineInstr *> Uses;
+    // PHI instructions have been processed before.
+    findNonPHIUsesInBlock(Reg, MBB, Uses);
+
+    if (Uses.size() == 1) {
+      LLVM_DEBUG(dbgs() << "Found one Non-PHI use in "
+                        << printMBBReference(*MBB) << '\n');
+      LV->HandleVirtRegUse(Reg, MBB, *(*Uses.begin()));
+    } else if (Uses.size() > 1) {
+      // Process the instructions in order
+      LLVM_DEBUG(dbgs() << "Found " << Uses.size() << " Non-PHI uses in "
+                        << printMBBReference(*MBB) << '\n');
+      for (MachineInstr &MI : *MBB) {
+        if (llvm::is_contained(Uses, &MI))
+          LV->HandleVirtRegUse(Reg, MBB, MI);
+      }
+    }
+
+    // Mark Reg alive through the block if this is a PHI incoming block
+    if (PHIIncoming.contains(MBB))
+      LV->MarkVirtRegAliveInBlock(OldVarInfo, MRI->getVRegDef(Reg)->getParent(),
+                                  MBB);
+  }
+
+  // Set the isKilled flag if we get new Kills in the THEN region.
+  for (auto *MI : OldVarInfo.Kills) {
+    if (Visited.contains(MI->getParent()))
+      MI->addRegisterKilled(Reg, TRI);
+  }
+}
+
+void SIOptimizeVGPRLiveRange::updateLiveRangeInElseRegion(
+    Register Reg, Register NewReg, MachineBasicBlock *Flow,
+    MachineBasicBlock *Endif,
+    SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const {
+  LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg);
+  LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+
+  // Transfer AliveBlocks from Reg to NewReg
+  for (auto *MBB : ElseBlocks) {
+    unsigned BBNum = MBB->getNumber();
+    if (OldVarInfo.AliveBlocks.test(BBNum)) {
+      NewVarInfo.AliveBlocks.set(BBNum);
+      LLVM_DEBUG(dbgs() << "Removing AliveBlock " << printMBBReference(*MBB)
+                        << '\n');
+      OldVarInfo.AliveBlocks.reset(BBNum);
+    }
+  }
+
+  // Transfer the possible Kills in ElseBlocks from Reg to NewReg
+  auto I = OldVarInfo.Kills.begin();
+  while (I != OldVarInfo.Kills.end()) {
+    if (ElseBlocks.contains((*I)->getParent())) {
+      NewVarInfo.Kills.push_back(*I);
+      I = OldVarInfo.Kills.erase(I);
+    } else {
+      ++I;
+    }
+  }
+}
+
+void SIOptimizeVGPRLiveRange::optimizeLiveRange(
+    Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow,
+    MachineBasicBlock *Endif,
+    SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const {
+  // Insert a new PHI, marking the value coming from the THEN region as
+  // undef.
+  LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n');
+  const auto *RC = MRI->getRegClass(Reg);
+  Register NewReg = MRI->createVirtualRegister(RC);
+  Register UndefReg = MRI->createVirtualRegister(RC);
+  MachineInstrBuilder PHI = BuildMI(*Flow, Flow->getFirstNonPHI(), DebugLoc(),
+                                    TII->get(TargetOpcode::PHI), NewReg);
+  for (auto *Pred : Flow->predecessors()) {
+    if (Pred == If)
+      PHI.addReg(Reg).addMBB(Pred);
+    else
+      PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred);
+  }
+
+  // Replace all uses in the ELSE region or in the PHIs of the ENDIF block
+  for (auto I = MRI->use_begin(Reg), E = MRI->use_end(); I != E;) {
+    MachineOperand &O = *I;
+    // This is a little bit tricky: setReg() will update the use linked list,
+    // so we have to increment the iterator before calling setReg() to avoid
+    // skipping some uses.
+    ++I;
+    auto *UseMI = O.getParent();
+    auto *UseBlock = UseMI->getParent();
+    // Replace uses in Endif block
+    if (UseBlock == Endif) {
+      assert(UseMI->isPHI() && "Uses should be PHI in Endif block");
+      O.setReg(NewReg);
+      continue;
+    }
+
+    // Replace uses in Else region
+    if (ElseBlocks.contains(UseBlock))
+      O.setReg(NewReg);
+  }
+
+  // The optimized Reg is not alive through the Flow block anymore.
+  LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
+  OldVarInfo.AliveBlocks.reset(Flow->getNumber());
+
+  updateLiveRangeInElseRegion(Reg, NewReg, Flow, Endif, ElseBlocks);
+  updateLiveRangeInThenRegion(Reg, If, Flow);
+}
+
+char SIOptimizeVGPRLiveRange::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
+                      "SI Optimize VGPR LiveRange", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
+                    "SI Optimize VGPR LiveRange", false, false)
+
+char &llvm::SIOptimizeVGPRLiveRangeID = SIOptimizeVGPRLiveRange::ID;
+
+FunctionPass *llvm::createSIOptimizeVGPRLiveRangePass() {
+  return new SIOptimizeVGPRLiveRange();
+}
+
+bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  Loops = &getAnalysis<MachineLoopInfo>();
+  LV = &getAnalysis<LiveVariables>();
+  MRI = &MF.getRegInfo();
+
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  bool MadeChange = false;
+
+  // TODO: we need to think about the order of visiting the blocks to get
+  // optimal results for nested if-else cases.
+  for (MachineBasicBlock &MBB : MF) {
+    for (auto &MI : MBB.terminators()) {
+      // Detect the if-else blocks
+      if (MI.getOpcode() == AMDGPU::SI_IF) {
+        MachineBasicBlock *IfTarget = MI.getOperand(2).getMBB();
+        auto *Endif = getElseTarget(IfTarget);
+        if (!Endif)
+          continue;
+
+        SmallSetVector<MachineBasicBlock *, 16> ElseBlocks;
+        SmallVector<Register> CandidateRegs;
+
+        LLVM_DEBUG(dbgs() << "Checking IF-ELSE-ENDIF: "
+                          << printMBBReference(MBB) << ' '
+                          << printMBBReference(*IfTarget) << ' '
+                          << printMBBReference(*Endif) << '\n');
+
+        // Collect all the blocks in the ELSE region
+        collectElseRegionBlocks(IfTarget, Endif, ElseBlocks);
+
+        // Collect the registers that can be optimized
+        collectCandidateRegisters(&MBB, IfTarget, Endif, ElseBlocks,
+                                  CandidateRegs);
+        MadeChange |= !CandidateRegs.empty();
+        // Now we are safe to optimize.
+        for (auto Reg : CandidateRegs)
+          optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks);
+      }
+    }
+  }
+
+  return MadeChange;
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index db0329c5a050f..a899655cfd96e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -17,147 +17,149 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    s_cbranch_execz BB0_2
 ; CHECK-NEXT:  ; %bb.1:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v5, v5, v4
+; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v4
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v5
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v7, v3
-; CHECK-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v0, v8
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v10, vcc, 0, v5
-; CHECK-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CHECK-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v6
-; CHECK-NEXT:    v_trunc_f32_e32 v9, v9
-; CHECK-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v9
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v3
+; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; CHECK-NEXT:    v_sub_i32_e32 v8, vcc, 0, v2
+; CHECK-NEXT:    v_subb_u32_e32 v9, vcc, 0, v3, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v7
+; CHECK-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
+; CHECK-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; CHECK-NEXT:    v_subb_u32_e32 v11, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v7, v7, v8
-; CHECK-NEXT:    v_mul_lo_u32 v12, v11, v6
-; CHECK-NEXT:    v_mul_lo_u32 v13, v10, v9
-; CHECK-NEXT:    v_mul_hi_u32 v15, v10, v6
-; CHECK-NEXT:    v_mul_lo_u32 v14, v10, v6
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v8
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
+; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v5
+; CHECK-NEXT:    v_mul_lo_u32 v11, v8, v6
+; CHECK-NEXT:    v_mul_hi_u32 v13, v8, v5
+; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v5
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CHECK-NEXT:    v_mul_lo_u32 v11, v6, v12
+; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v10
+; CHECK-NEXT:    v_mul_hi_u32 v14, v5, v12
+; CHECK-NEXT:    v_mul_hi_u32 v12, v6, v12
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v14, v6, v10
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CHECK-NEXT:    v_mul_hi_u32 v13, v5, v10
+; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v10
+; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CHECK-NEXT:    v_mul_lo_u32 v13, v9, v14
-; CHECK-NEXT:    v_mul_lo_u32 v15, v6, v12
-; CHECK-NEXT:    v_mul_hi_u32 v16, v6, v14
-; CHECK-NEXT:    v_mul_hi_u32 v14, v9, v14
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v16, v9, v12
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; CHECK-NEXT:    v_mul_hi_u32 v15, v6, v12
-; CHECK-NEXT:    v_mul_hi_u32 v12, v9, v12
-; CHECK-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
-; CHECK-NEXT:    v_addc_u32_e64 v13, s[4:5], v9, v12, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v11, v11, v6
-; CHECK-NEXT:    v_mul_lo_u32 v14, v10, v13
-; CHECK-NEXT:    v_mul_lo_u32 v15, v10, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, v10, v6
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CHECK-NEXT:    v_addc_u32_e64 v11, s[4:5], v6, v10, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v9, v5
+; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v11
+; CHECK-NEXT:    v_mul_lo_u32 v13, v8, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v8, v5
+; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
+; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v13
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
+; CHECK-NEXT:    v_mul_lo_u32 v9, v11, v13
+; CHECK-NEXT:    v_mul_lo_u32 v12, v5, v8
+; CHECK-NEXT:    v_mul_hi_u32 v13, v11, v13
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT:    v_mul_hi_u32 v12, v6, v15
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CHECK-NEXT:    v_mul_lo_u32 v11, v13, v15
-; CHECK-NEXT:    v_mul_lo_u32 v14, v6, v10
-; CHECK-NEXT:    v_mul_hi_u32 v15, v13, v15
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v12, v13, v10
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; CHECK-NEXT:    v_mul_hi_u32 v14, v6, v10
-; CHECK-NEXT:    v_mul_hi_u32 v10, v13, v10
-; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CHECK-NEXT:    v_mul_lo_u32 v10, v11, v8
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
+; CHECK-NEXT:    v_mul_hi_u32 v12, v5, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v11, v8
+; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
+; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; CHECK-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v0, v6
+; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v1, v5
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v6
-; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v9
-; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v6
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v1, v6
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v12, v1, v9
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT:    v_mul_hi_u32 v11, v7, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v1, v9
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v6
-; CHECK-NEXT:    v_mul_lo_u32 v11, v5, v9
-; CHECK-NEXT:    v_mul_hi_u32 v13, v5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v12, v5, v6
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT:    v_subb_u32_e64 v11, s[4:5], v1, v10, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v10
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v3
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v2, v6
+; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
+; CHECK-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v5
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v7, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v3
-; CHECK-NEXT:    v_add_i32_e32 v11, vcc, 1, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[4:5]
-; CHECK-NEXT:    v_addc_u32_e32 v12, vcc, 0, v9, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT:    v_addc_u32_e32 v10, vcc, 0, v6, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v13, v5, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v11
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v12, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v11, v3, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v12, v5, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
+; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, 0, v10, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v2, v7, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v5, v8, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v5
-; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v1, v5
-; CHECK-NEXT:    v_subb_u32_e32 v5, vcc, v3, v5, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT:    v_subb_u32_e32 v5, vcc, v1, v2, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:  BB0_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -702,146 +704,148 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v0
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v5, v0, vcc
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v5, v0, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v10, v1
-; CGP-NEXT:    v_cvt_f32_u32_e32 v11, v5
-; CGP-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
-; CGP-NEXT:    v_mac_f32_e32 v10, 0x4f800000, v11
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v10, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v8, v12
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v12, vcc
-; CGP-NEXT:    v_sub_i32_e32 v14, vcc, 0, v1
-; CGP-NEXT:    v_mul_f32_e32 v10, 0x5f7ffffc, v10
-; CGP-NEXT:    v_mul_f32_e32 v13, 0x2f800000, v10
-; CGP-NEXT:    v_trunc_f32_e32 v13, v13
-; CGP-NEXT:    v_mac_f32_e32 v10, 0xcf800000, v13
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v10, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v11, 31, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v10
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v1
+; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v8, v8, v11
+; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v5
+; CGP-NEXT:    v_trunc_f32_e32 v10, v10
+; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v10
+; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT:    v_cvt_u32_f32_e32 v13, v13
-; CGP-NEXT:    v_subb_u32_e32 v15, vcc, 0, v5, vcc
-; CGP-NEXT:    v_xor_b32_e32 v11, v11, v12
-; CGP-NEXT:    v_mul_lo_u32 v16, v15, v10
-; CGP-NEXT:    v_mul_lo_u32 v17, v14, v13
-; CGP-NEXT:    v_mul_hi_u32 v19, v14, v10
-; CGP-NEXT:    v_mul_lo_u32 v18, v14, v10
-; CGP-NEXT:    v_xor_b32_e32 v9, v9, v12
+; CGP-NEXT:    v_xor_b32_e32 v9, v9, v11
+; CGP-NEXT:    v_mul_lo_u32 v14, v13, v5
+; CGP-NEXT:    v_mul_lo_u32 v15, v12, v10
+; CGP-NEXT:    v_mul_hi_u32 v17, v12, v5
+; CGP-NEXT:    v_mul_lo_u32 v16, v12, v5
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
+; CGP-NEXT:    v_mul_lo_u32 v15, v10, v16
+; CGP-NEXT:    v_mul_lo_u32 v17, v5, v14
+; CGP-NEXT:    v_mul_hi_u32 v18, v5, v16
+; CGP-NEXT:    v_mul_hi_u32 v16, v10, v16
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v18, v10, v14
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT:    v_mul_hi_u32 v17, v5, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v10, v14
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v19
-; CGP-NEXT:    v_mul_lo_u32 v17, v13, v18
-; CGP-NEXT:    v_mul_lo_u32 v19, v10, v16
-; CGP-NEXT:    v_mul_hi_u32 v20, v10, v18
-; CGP-NEXT:    v_mul_hi_u32 v18, v13, v18
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v20, v13, v16
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; CGP-NEXT:    v_mul_hi_u32 v19, v10, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v13, v16
-; CGP-NEXT:    v_add_i32_e32 v18, vcc, v20, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v18, vcc, v18, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
 ; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
-; CGP-NEXT:    v_addc_u32_e64 v17, s[4:5], v13, v16, vcc
-; CGP-NEXT:    v_mul_lo_u32 v15, v15, v10
-; CGP-NEXT:    v_mul_lo_u32 v18, v14, v17
-; CGP-NEXT:    v_mul_lo_u32 v19, v14, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, v14, v10
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
+; CGP-NEXT:    v_addc_u32_e64 v15, s[4:5], v10, v14, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v13, v5
+; CGP-NEXT:    v_mul_lo_u32 v16, v12, v15
+; CGP-NEXT:    v_mul_lo_u32 v17, v12, v5
+; CGP-NEXT:    v_mul_hi_u32 v12, v12, v5
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v14
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
+; CGP-NEXT:    v_mul_hi_u32 v14, v5, v17
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v15, v17
+; CGP-NEXT:    v_mul_lo_u32 v16, v5, v12
+; CGP-NEXT:    v_mul_hi_u32 v17, v15, v17
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT:    v_mul_hi_u32 v16, v10, v19
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; CGP-NEXT:    v_mul_lo_u32 v15, v17, v19
-; CGP-NEXT:    v_mul_lo_u32 v18, v10, v14
-; CGP-NEXT:    v_mul_hi_u32 v19, v17, v19
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v16, v17, v14
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v18, v15
-; CGP-NEXT:    v_mul_hi_u32 v18, v10, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v17, v14
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v18, s[4:5], v19, v18
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v18, v16
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v14, v15, v12
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
+; CGP-NEXT:    v_mul_hi_u32 v16, v5, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, v13, v14, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v16
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v5
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v10
+; CGP-NEXT:    v_mul_hi_u32 v14, v8, v5
+; CGP-NEXT:    v_mul_hi_u32 v5, v9, v5
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v14, v9, v10
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v9, v10
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v16, v9, v13
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_mul_hi_u32 v15, v11, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v14, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_mul_lo_u32 v14, v5, v10
-; CGP-NEXT:    v_mul_lo_u32 v15, v1, v13
-; CGP-NEXT:    v_mul_hi_u32 v17, v1, v10
-; CGP-NEXT:    v_mul_lo_u32 v16, v1, v10
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v11, v16
-; CGP-NEXT:    v_subb_u32_e64 v15, s[4:5], v9, v14, vcc
-; CGP-NEXT:    v_sub_i32_e64 v9, s[4:5], v9, v14
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v15, v5
-; CGP-NEXT:    v_subb_u32_e32 v9, vcc, v9, v5, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v11, v1
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_mul_lo_u32 v12, v4, v5
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v10
+; CGP-NEXT:    v_mul_hi_u32 v15, v1, v5
+; CGP-NEXT:    v_mul_lo_u32 v14, v1, v5
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v14
+; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v9, v12, vcc
+; CGP-NEXT:    v_sub_i32_e64 v9, s[4:5], v9, v12
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v4
+; CGP-NEXT:    v_subb_u32_e32 v9, vcc, v9, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v1
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v1
 ; CGP-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v15, v5
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, 1, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v14, v14, v16, s[4:5]
-; CGP-NEXT:    v_addc_u32_e32 v16, vcc, 0, v13, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v4
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
+; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v10, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v15
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v16, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v4
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v15, v1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v13
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v14, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v15, v5, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v16, v9, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; CGP-NEXT:    v_xor_b32_e32 v9, v12, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v1, v9
-; CGP-NEXT:    v_xor_b32_e32 v1, v5, v9
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v9
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v13, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v14, v8, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; CGP-NEXT:    v_xor_b32_e32 v5, v11, v0
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v0, v1, v5
+; CGP-NEXT:    v_xor_b32_e32 v1, v4, v5
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr8
+; CGP-NEXT:    ; implicit-def: $vgpr4
 ; CGP-NEXT:  BB2_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -879,146 +883,148 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:  ; %bb.5:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v7
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v4
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v4, vcc
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v7, v4, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
-; CGP-NEXT:    v_xor_b32_e32 v7, v7, v4
-; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v5
-; CGP-NEXT:    v_cvt_f32_u32_e32 v9, v7
-; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v2, v10
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v5
-; CGP-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
-; CGP-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v8
-; CGP-NEXT:    v_trunc_f32_e32 v11, v11
-; CGP-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v11
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v5
+; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v6
+; CGP-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
+; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v5
+; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v6, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v9
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
+; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
+; CGP-NEXT:    v_trunc_f32_e32 v8, v8
+; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
+; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v7, vcc
-; CGP-NEXT:    v_xor_b32_e32 v9, v9, v10
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v8
-; CGP-NEXT:    v_mul_lo_u32 v15, v12, v11
-; CGP-NEXT:    v_mul_hi_u32 v17, v12, v8
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v8
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v9
+; CGP-NEXT:    v_mul_lo_u32 v12, v11, v7
+; CGP-NEXT:    v_mul_lo_u32 v13, v10, v8
+; CGP-NEXT:    v_mul_hi_u32 v15, v10, v7
+; CGP-NEXT:    v_mul_lo_u32 v14, v10, v7
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v14
+; CGP-NEXT:    v_mul_lo_u32 v15, v7, v12
+; CGP-NEXT:    v_mul_hi_u32 v16, v7, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v8, v14
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v16, v8, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT:    v_mul_hi_u32 v15, v7, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v8, v12
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v16
-; CGP-NEXT:    v_mul_lo_u32 v17, v8, v14
-; CGP-NEXT:    v_mul_hi_u32 v18, v8, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v11, v16
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v18, v11, v14
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT:    v_mul_hi_u32 v17, v8, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v11, v14
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
-; CGP-NEXT:    v_addc_u32_e64 v15, s[4:5], v11, v14, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v13, v8
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v15
-; CGP-NEXT:    v_mul_lo_u32 v17, v12, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, v12, v8
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v8, v12, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v11, v7
+; CGP-NEXT:    v_mul_lo_u32 v14, v10, v13
+; CGP-NEXT:    v_mul_lo_u32 v15, v10, v7
+; CGP-NEXT:    v_mul_hi_u32 v10, v10, v7
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
+; CGP-NEXT:    v_mul_hi_u32 v12, v7, v15
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v13, v15
+; CGP-NEXT:    v_mul_lo_u32 v14, v7, v10
+; CGP-NEXT:    v_mul_hi_u32 v15, v13, v15
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_mul_hi_u32 v14, v8, v17
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v15, v17
-; CGP-NEXT:    v_mul_lo_u32 v16, v8, v12
-; CGP-NEXT:    v_mul_hi_u32 v17, v15, v17
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v14, v15, v12
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v8, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v16
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v12, v13, v10
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v7, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, v11, v12, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v10, v3, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, v2, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v2, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v11, v2, v8
 ; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v3, v11
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v13, v9, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v3, v11
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v12, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
-; CGP-NEXT:    v_mul_lo_u32 v13, v5, v11
-; CGP-NEXT:    v_mul_hi_u32 v15, v5, v8
-; CGP-NEXT:    v_mul_lo_u32 v14, v5, v8
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v9, v14
-; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v3, v12, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v12
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v7
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v9, v5
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_mul_lo_u32 v10, v6, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v13, v5, v7
+; CGP-NEXT:    v_mul_lo_u32 v12, v5, v7
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT:    v_subb_u32_e64 v11, s[4:5], v3, v10, vcc
+; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v10
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v6
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v7
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
-; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v11, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v15, v5, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v13
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v14, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v13, v5, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v14, v7, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v6
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v7
+; CGP-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[4:5]
+; CGP-NEXT:    v_addc_u32_e32 v12, vcc, 0, v8, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v13, v2, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v11
+; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v12, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v5, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v5, v9, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; CGP-NEXT:    v_xor_b32_e32 v7, v10, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v7
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v7
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v3, v7
-; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v5, v7, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v5
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v2, v5
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v5
+; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v3, v5, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr6
 ; CGP-NEXT:  BB2_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -2516,146 +2522,148 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:  ; %bb.1:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v2
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v2, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v5, v2, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v2
-; CHECK-NEXT:    v_xor_b32_e32 v5, v5, v2
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v3
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v7, v5
-; CHECK-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v0, v8
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v10, vcc, 0, v3
-; CHECK-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CHECK-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v6
-; CHECK-NEXT:    v_trunc_f32_e32 v9, v9
-; CHECK-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v9
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v4
+; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; CHECK-NEXT:    v_sub_i32_e32 v8, vcc, 0, v3
+; CHECK-NEXT:    v_subb_u32_e32 v9, vcc, 0, v4, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v7
+; CHECK-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
+; CHECK-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; CHECK-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v7, v7, v8
-; CHECK-NEXT:    v_mul_lo_u32 v12, v11, v6
-; CHECK-NEXT:    v_mul_lo_u32 v13, v10, v9
-; CHECK-NEXT:    v_mul_hi_u32 v15, v10, v6
-; CHECK-NEXT:    v_mul_lo_u32 v14, v10, v6
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v8
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
+; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v5
+; CHECK-NEXT:    v_mul_lo_u32 v11, v8, v6
+; CHECK-NEXT:    v_mul_hi_u32 v13, v8, v5
+; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v5
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CHECK-NEXT:    v_mul_lo_u32 v11, v6, v12
+; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v10
+; CHECK-NEXT:    v_mul_hi_u32 v14, v5, v12
+; CHECK-NEXT:    v_mul_hi_u32 v12, v6, v12
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v14, v6, v10
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CHECK-NEXT:    v_mul_hi_u32 v13, v5, v10
+; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v10
+; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CHECK-NEXT:    v_mul_lo_u32 v13, v9, v14
-; CHECK-NEXT:    v_mul_lo_u32 v15, v6, v12
-; CHECK-NEXT:    v_mul_hi_u32 v16, v6, v14
-; CHECK-NEXT:    v_mul_hi_u32 v14, v9, v14
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v16, v9, v12
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; CHECK-NEXT:    v_mul_hi_u32 v15, v6, v12
-; CHECK-NEXT:    v_mul_hi_u32 v12, v9, v12
-; CHECK-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
-; CHECK-NEXT:    v_addc_u32_e64 v13, s[4:5], v9, v12, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v11, v11, v6
-; CHECK-NEXT:    v_mul_lo_u32 v14, v10, v13
-; CHECK-NEXT:    v_mul_lo_u32 v15, v10, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, v10, v6
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CHECK-NEXT:    v_addc_u32_e64 v11, s[4:5], v6, v10, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v9, v5
+; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v11
+; CHECK-NEXT:    v_mul_lo_u32 v13, v8, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v8, v5
+; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
+; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v13
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
+; CHECK-NEXT:    v_mul_lo_u32 v9, v11, v13
+; CHECK-NEXT:    v_mul_lo_u32 v12, v5, v8
+; CHECK-NEXT:    v_mul_hi_u32 v13, v11, v13
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT:    v_mul_hi_u32 v12, v6, v15
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CHECK-NEXT:    v_mul_lo_u32 v11, v13, v15
-; CHECK-NEXT:    v_mul_lo_u32 v14, v6, v10
-; CHECK-NEXT:    v_mul_hi_u32 v15, v13, v15
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v12, v13, v10
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; CHECK-NEXT:    v_mul_hi_u32 v14, v6, v10
-; CHECK-NEXT:    v_mul_hi_u32 v10, v13, v10
-; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CHECK-NEXT:    v_mul_lo_u32 v10, v11, v8
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
+; CHECK-NEXT:    v_mul_hi_u32 v12, v5, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v11, v8
+; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
+; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; CHECK-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v0, v6
+; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v1, v5
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v6
-; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v9
-; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v6
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v1, v6
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v12, v1, v9
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT:    v_mul_hi_u32 v11, v7, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v1, v9
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v11, v3, v9
-; CHECK-NEXT:    v_mul_hi_u32 v13, v3, v6
-; CHECK-NEXT:    v_mul_lo_u32 v12, v3, v6
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT:    v_subb_u32_e64 v11, s[4:5], v1, v10, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v10
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v5
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v3, v6
+; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
+; CHECK-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v5
-; CHECK-NEXT:    v_add_i32_e32 v11, vcc, 1, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[4:5]
-; CHECK-NEXT:    v_addc_u32_e32 v12, vcc, 0, v9, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v13, v3, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v11
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v12, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v11, v3, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v12, v5, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v4
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT:    v_addc_u32_e32 v10, vcc, 0, v6, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
+; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, 0, v10, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v3, v7, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v5, v8, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v5
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v1, v5
-; CHECK-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v0, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:  BB7_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -3008,142 +3016,144 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; CGP-NEXT:    v_xor_b32_e32 v4, v4, v0
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v1
-; CGP-NEXT:    v_cvt_f32_u32_e32 v11, v4
-; CGP-NEXT:    v_ashrrev_i32_e32 v12, 31, v7
-; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v11
+; CGP-NEXT:    v_cvt_f32_u32_e32 v10, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
+; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v10
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v5, v12
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v12, vcc
-; CGP-NEXT:    v_sub_i32_e32 v14, vcc, 0, v1
+; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v1
+; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v11
 ; CGP-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CGP-NEXT:    v_mul_f32_e32 v13, 0x2f800000, v6
-; CGP-NEXT:    v_trunc_f32_e32 v13, v13
-; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v13
+; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v6
+; CGP-NEXT:    v_trunc_f32_e32 v10, v10
+; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v10
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v13, v13
-; CGP-NEXT:    v_subb_u32_e32 v15, vcc, 0, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v11, v11, v12
-; CGP-NEXT:    v_mul_lo_u32 v16, v15, v6
-; CGP-NEXT:    v_mul_lo_u32 v17, v14, v13
-; CGP-NEXT:    v_mul_hi_u32 v19, v14, v6
-; CGP-NEXT:    v_mul_lo_u32 v18, v14, v6
-; CGP-NEXT:    v_xor_b32_e32 v7, v7, v12
+; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; CGP-NEXT:    v_xor_b32_e32 v7, v7, v11
+; CGP-NEXT:    v_mul_lo_u32 v14, v13, v6
+; CGP-NEXT:    v_mul_lo_u32 v15, v12, v10
+; CGP-NEXT:    v_mul_hi_u32 v17, v12, v6
+; CGP-NEXT:    v_mul_lo_u32 v16, v12, v6
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
+; CGP-NEXT:    v_mul_lo_u32 v15, v10, v16
+; CGP-NEXT:    v_mul_lo_u32 v17, v6, v14
+; CGP-NEXT:    v_mul_hi_u32 v18, v6, v16
+; CGP-NEXT:    v_mul_hi_u32 v16, v10, v16
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v18, v10, v14
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT:    v_mul_hi_u32 v17, v6, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v10, v14
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v19
-; CGP-NEXT:    v_mul_lo_u32 v17, v13, v18
-; CGP-NEXT:    v_mul_lo_u32 v19, v6, v16
-; CGP-NEXT:    v_mul_hi_u32 v20, v6, v18
-; CGP-NEXT:    v_mul_hi_u32 v18, v13, v18
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v20, v13, v16
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; CGP-NEXT:    v_mul_hi_u32 v19, v6, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v13, v16
-; CGP-NEXT:    v_add_i32_e32 v18, vcc, v20, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v18, vcc, v18, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
 ; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v17
-; CGP-NEXT:    v_addc_u32_e64 v17, s[4:5], v13, v16, vcc
-; CGP-NEXT:    v_mul_lo_u32 v15, v15, v6
-; CGP-NEXT:    v_mul_lo_u32 v18, v14, v17
-; CGP-NEXT:    v_mul_lo_u32 v19, v14, v6
-; CGP-NEXT:    v_mul_hi_u32 v14, v14, v6
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
+; CGP-NEXT:    v_addc_u32_e64 v15, s[4:5], v10, v14, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v13, v6
+; CGP-NEXT:    v_mul_lo_u32 v16, v12, v15
+; CGP-NEXT:    v_mul_lo_u32 v17, v12, v6
+; CGP-NEXT:    v_mul_hi_u32 v12, v12, v6
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v14
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
+; CGP-NEXT:    v_mul_hi_u32 v14, v6, v17
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v15, v17
+; CGP-NEXT:    v_mul_lo_u32 v16, v6, v12
+; CGP-NEXT:    v_mul_hi_u32 v17, v15, v17
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT:    v_mul_hi_u32 v16, v6, v19
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; CGP-NEXT:    v_mul_lo_u32 v15, v17, v19
-; CGP-NEXT:    v_mul_lo_u32 v18, v6, v14
-; CGP-NEXT:    v_mul_hi_u32 v19, v17, v19
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v16, v17, v14
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v18, v15
-; CGP-NEXT:    v_mul_hi_u32 v18, v6, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v17, v14
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v18, s[4:5], v19, v18
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v18, v16
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v14, v15, v12
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
+; CGP-NEXT:    v_mul_hi_u32 v16, v6, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, v13, v14, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v11, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v16
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v6
+; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v14, v5, v6
 ; CGP-NEXT:    v_mul_hi_u32 v6, v7, v6
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v16, v7, v13
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_mul_hi_u32 v15, v11, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v16, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v14, v7, v10
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v13, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v7, v10
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_mul_lo_u32 v14, v4, v6
-; CGP-NEXT:    v_mul_lo_u32 v15, v1, v13
-; CGP-NEXT:    v_mul_hi_u32 v17, v1, v6
-; CGP-NEXT:    v_mul_lo_u32 v16, v1, v6
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v11, v16
-; CGP-NEXT:    v_subb_u32_e64 v15, s[4:5], v7, v14, vcc
-; CGP-NEXT:    v_sub_i32_e64 v7, s[4:5], v7, v14
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v15, v4
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_mul_lo_u32 v12, v4, v6
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v10
+; CGP-NEXT:    v_mul_hi_u32 v15, v1, v6
+; CGP-NEXT:    v_mul_lo_u32 v14, v1, v6
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v14
+; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v7, v12, vcc
+; CGP-NEXT:    v_sub_i32_e64 v7, s[4:5], v7, v12
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v4
 ; CGP-NEXT:    v_subb_u32_e32 v7, vcc, v7, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v11, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v1
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v1
 ; CGP-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v15, v4
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, 1, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v14, v14, v16, s[4:5]
-; CGP-NEXT:    v_addc_u32_e32 v16, vcc, 0, v13, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v4
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
+; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v10, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v15
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v16, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v15, v1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v13
+; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v14, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v15, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v16, v7, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v13, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v14, v5, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; CGP-NEXT:    v_xor_b32_e32 v6, v12, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v1, v6
-; CGP-NEXT:    v_xor_b32_e32 v1, v4, v6
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CGP-NEXT:    v_xor_b32_e32 v5, v11, v0
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v0, v1, v5
+; CGP-NEXT:    v_xor_b32_e32 v1, v4, v5
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr5
+; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
 ; CGP-NEXT:  BB8_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -3185,142 +3195,144 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
 ; CGP-NEXT:    v_xor_b32_e32 v6, v6, v4
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v5
-; CGP-NEXT:    v_cvt_f32_u32_e32 v9, v6
-; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v9
+; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v6
+; CGP-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v2, v10
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v5
+; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v5
+; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v6, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v9
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; CGP-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v7
-; CGP-NEXT:    v_trunc_f32_e32 v11, v11
-; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v11
+; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
+; CGP-NEXT:    v_trunc_f32_e32 v8, v8
+; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v6, vcc
-; CGP-NEXT:    v_xor_b32_e32 v9, v9, v10
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v7
-; CGP-NEXT:    v_mul_lo_u32 v15, v12, v11
-; CGP-NEXT:    v_mul_hi_u32 v17, v12, v7
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v7
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v9
+; CGP-NEXT:    v_mul_lo_u32 v12, v11, v7
+; CGP-NEXT:    v_mul_lo_u32 v13, v10, v8
+; CGP-NEXT:    v_mul_hi_u32 v15, v10, v7
+; CGP-NEXT:    v_mul_lo_u32 v14, v10, v7
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v14
+; CGP-NEXT:    v_mul_lo_u32 v15, v7, v12
+; CGP-NEXT:    v_mul_hi_u32 v16, v7, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v8, v14
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v16, v8, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT:    v_mul_hi_u32 v15, v7, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v8, v12
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v16
-; CGP-NEXT:    v_mul_lo_u32 v17, v7, v14
-; CGP-NEXT:    v_mul_hi_u32 v18, v7, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v11, v16
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v18, v11, v14
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT:    v_mul_hi_u32 v17, v7, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v11, v14
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
-; CGP-NEXT:    v_addc_u32_e64 v15, s[4:5], v11, v14, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v13, v7
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v15
-; CGP-NEXT:    v_mul_lo_u32 v17, v12, v7
-; CGP-NEXT:    v_mul_hi_u32 v12, v12, v7
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v8, v12, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v11, v7
+; CGP-NEXT:    v_mul_lo_u32 v14, v10, v13
+; CGP-NEXT:    v_mul_lo_u32 v15, v10, v7
+; CGP-NEXT:    v_mul_hi_u32 v10, v10, v7
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
+; CGP-NEXT:    v_mul_hi_u32 v12, v7, v15
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v13, v15
+; CGP-NEXT:    v_mul_lo_u32 v14, v7, v10
+; CGP-NEXT:    v_mul_hi_u32 v15, v13, v15
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_mul_hi_u32 v14, v7, v17
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v15, v17
-; CGP-NEXT:    v_mul_lo_u32 v16, v7, v12
-; CGP-NEXT:    v_mul_hi_u32 v17, v15, v17
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v14, v15, v12
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v7, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v16
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v12, v13, v10
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v7, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, v11, v12, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v7
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v9, v7
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v10, v3, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, v2, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v2, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v3, v11
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v13, v9, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v3, v11
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v11, v2, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v12, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v13, v5, v11
-; CGP-NEXT:    v_mul_hi_u32 v15, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v14, v5, v7
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v9, v14
-; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v3, v12, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v12
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v6
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_mul_lo_u32 v10, v6, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v13, v5, v7
+; CGP-NEXT:    v_mul_lo_u32 v12, v5, v7
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT:    v_subb_u32_e64 v11, s[4:5], v3, v10, vcc
+; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v10
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v6
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v9, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v6
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
-; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v11, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v6
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v7
+; CGP-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[4:5]
+; CGP-NEXT:    v_addc_u32_e32 v12, vcc, 0, v8, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v15, v5, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v13
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, 0, v14, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v13, v5, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v14, v6, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; CGP-NEXT:    v_xor_b32_e32 v6, v10, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v6
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v3, v6
-; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v5, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v13, v2, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v11
+; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v12, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v5, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v5, v9, v4
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v5
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v2, v5
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v5
+; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v3, v5, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr8_vgpr9
 ; CGP-NEXT:  BB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 840653d8c4c54..2ae38a64fe34a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -17,145 +17,147 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    s_cbranch_execz BB0_2
 ; CHECK-NEXT:  ; %bb.1:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v4
-; CHECK-NEXT:    v_xor_b32_e32 v5, v5, v4
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v5
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v3
-; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v4
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v0, v7
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v2
+; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
 ; CHECK-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CHECK-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v4
-; CHECK-NEXT:    v_trunc_f32_e32 v8, v8
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
+; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
+; CHECK-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CHECK-NEXT:    v_subb_u32_e32 v10, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v6, v6, v7
-; CHECK-NEXT:    v_mul_lo_u32 v11, v10, v4
-; CHECK-NEXT:    v_mul_lo_u32 v12, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v14, v9, v4
-; CHECK-NEXT:    v_mul_lo_u32 v13, v9, v4
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v4
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v5
+; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v4
+; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v4
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v11
+; CHECK-NEXT:    v_mul_lo_u32 v12, v4, v9
+; CHECK-NEXT:    v_mul_hi_u32 v13, v4, v11
+; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v11
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CHECK-NEXT:    v_mul_hi_u32 v12, v4, v9
+; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v13
-; CHECK-NEXT:    v_mul_lo_u32 v14, v4, v11
-; CHECK-NEXT:    v_mul_hi_u32 v15, v4, v13
-; CHECK-NEXT:    v_mul_hi_u32 v13, v8, v13
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v15, v8, v11
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; CHECK-NEXT:    v_mul_hi_u32 v14, v4, v11
-; CHECK-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CHECK-NEXT:    v_addc_u32_e64 v12, s[4:5], v8, v11, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v10, v10, v4
-; CHECK-NEXT:    v_mul_lo_u32 v13, v9, v12
-; CHECK-NEXT:    v_mul_lo_u32 v14, v9, v4
-; CHECK-NEXT:    v_mul_hi_u32 v9, v9, v4
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v8, v8, v4
+; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v10
+; CHECK-NEXT:    v_mul_lo_u32 v12, v7, v4
+; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v4
+; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
+; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v12
+; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v10, v12
+; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v7
+; CHECK-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CHECK-NEXT:    v_mul_hi_u32 v11, v4, v14
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
-; CHECK-NEXT:    v_mul_lo_u32 v10, v12, v14
-; CHECK-NEXT:    v_mul_lo_u32 v13, v4, v9
-; CHECK-NEXT:    v_mul_hi_u32 v14, v12, v14
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v11, v12, v9
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v10
-; CHECK-NEXT:    v_mul_hi_u32 v13, v4, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v12, v9
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; CHECK-NEXT:    v_mul_lo_u32 v9, v10, v7
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
+; CHECK-NEXT:    v_mul_hi_u32 v11, v4, v7
+; CHECK-NEXT:    v_mul_hi_u32 v7, v10, v7
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; CHECK-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v4
-; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v8
-; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
+; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v8, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
 ; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v11, v1, v8
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v1, v8
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v1, v5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT:    v_mul_lo_u32 v9, v3, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v8
-; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v5, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v6, v10
-; CHECK-NEXT:    v_subb_u32_e64 v8, s[4:5], v1, v4, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, v2, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v4
+; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v4
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v3
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v3
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v9, vcc, v6, v5
-; CHECK-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v0, v2
+; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v2
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v3
-; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v9, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v7
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
-; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v3, v7
-; CHECK-NEXT:    v_subb_u32_e32 v5, vcc, v1, v7, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT:    v_subb_u32_e32 v5, vcc, v1, v6, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:  BB0_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -690,144 +692,146 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v0
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v5, v0, vcc
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v5, v0, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
-; CGP-NEXT:    v_xor_b32_e32 v0, v5, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v1
-; CGP-NEXT:    v_cvt_f32_u32_e32 v10, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v11, 31, v9
-; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v10
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v8, v11
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
-; CGP-NEXT:    v_sub_i32_e32 v13, vcc, 0, v1
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT:    v_mul_f32_e32 v12, 0x2f800000, v5
-; CGP-NEXT:    v_trunc_f32_e32 v12, v12
-; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v12
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; CGP-NEXT:    v_subb_u32_e32 v14, vcc, 0, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v10, v10, v11
-; CGP-NEXT:    v_mul_lo_u32 v15, v14, v5
-; CGP-NEXT:    v_mul_lo_u32 v16, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v18, v13, v5
-; CGP-NEXT:    v_mul_lo_u32 v17, v13, v5
-; CGP-NEXT:    v_xor_b32_e32 v9, v9, v11
+; CGP-NEXT:    v_xor_b32_e32 v0, v4, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v0
+; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v10
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v10, vcc
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v4
+; CGP-NEXT:    v_trunc_f32_e32 v9, v9
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v9
+; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v10
+; CGP-NEXT:    v_mul_lo_u32 v13, v12, v4
+; CGP-NEXT:    v_mul_lo_u32 v14, v11, v9
+; CGP-NEXT:    v_mul_hi_u32 v16, v11, v4
+; CGP-NEXT:    v_mul_lo_u32 v15, v11, v4
+; CGP-NEXT:    v_xor_b32_e32 v8, v8, v10
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v15
+; CGP-NEXT:    v_mul_lo_u32 v16, v4, v13
+; CGP-NEXT:    v_mul_hi_u32 v17, v4, v15
+; CGP-NEXT:    v_mul_hi_u32 v15, v9, v15
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v17, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT:    v_mul_hi_u32 v16, v4, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v17
-; CGP-NEXT:    v_mul_lo_u32 v18, v5, v15
-; CGP-NEXT:    v_mul_hi_u32 v19, v5, v17
-; CGP-NEXT:    v_mul_hi_u32 v17, v12, v17
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v19
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v19, v12, v15
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
-; CGP-NEXT:    v_mul_hi_u32 v18, v5, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v12, v15
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v16
-; CGP-NEXT:    v_addc_u32_e64 v16, s[4:5], v12, v15, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v14, v5
-; CGP-NEXT:    v_mul_lo_u32 v17, v13, v16
-; CGP-NEXT:    v_mul_lo_u32 v18, v13, v5
-; CGP-NEXT:    v_mul_hi_u32 v13, v13, v5
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT:    v_addc_u32_e64 v14, s[4:5], v9, v13, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v12, v4
+; CGP-NEXT:    v_mul_lo_u32 v15, v11, v14
+; CGP-NEXT:    v_mul_lo_u32 v16, v11, v4
+; CGP-NEXT:    v_mul_hi_u32 v11, v11, v4
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v16
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v14, v16
+; CGP-NEXT:    v_mul_lo_u32 v15, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v16, v14, v16
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT:    v_mul_hi_u32 v15, v5, v18
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT:    v_mul_lo_u32 v14, v16, v18
-; CGP-NEXT:    v_mul_lo_u32 v17, v5, v13
-; CGP-NEXT:    v_mul_hi_u32 v18, v16, v18
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v15, v16, v13
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v17, v14
-; CGP-NEXT:    v_mul_hi_u32 v17, v5, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v16, v13
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v17, s[4:5], v18, v17
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v17, v15
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v13, v14, v11
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
+; CGP-NEXT:    v_mul_hi_u32 v15, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v14, v11
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; CGP-NEXT:    v_addc_u32_e32 v12, vcc, v12, v13, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT:    v_addc_u32_e32 v12, vcc, 0, v12, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v5
-; CGP-NEXT:    v_mul_lo_u32 v14, v10, v12
-; CGP-NEXT:    v_mul_hi_u32 v15, v10, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v9, v5
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v15, v9, v12
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_mul_hi_u32 v14, v10, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v15, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v8, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v5, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v5, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v8, v4
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v9
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT:    v_mul_lo_u32 v13, v0, v5
-; CGP-NEXT:    v_mul_lo_u32 v12, v1, v12
-; CGP-NEXT:    v_mul_lo_u32 v14, v1, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v10, v14
-; CGP-NEXT:    v_subb_u32_e64 v12, s[4:5], v9, v5, vcc
-; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v9, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_mul_lo_u32 v11, v0, v4
+; CGP-NEXT:    v_mul_lo_u32 v9, v1, v9
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT:    v_subb_u32_e64 v9, s[4:5], v8, v4, vcc
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v8, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v0
+; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v5, v1
+; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v0
-; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v5, v0, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v9, v9, v13, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v13, vcc, v10, v1
-; CGP-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v5, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v14, v0
-; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v5, v0, vcc
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v15, v15, v16, s[4:5]
+; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v4, v0, vcc
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v11, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v14, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v11
-; CGP-NEXT:    v_xor_b32_e32 v5, v0, v11
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v11
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v5, v11, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v10
+; CGP-NEXT:    v_xor_b32_e32 v4, v0, v10
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v10
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v4, v10, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr8
+; CGP-NEXT:    ; implicit-def: $vgpr4
 ; CGP-NEXT:  BB2_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -863,144 +867,146 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:  ; %bb.5:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v7
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v4
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v4, vcc
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v7, v4, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
-; CGP-NEXT:    v_xor_b32_e32 v4, v7, v4
-; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v5
-; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; CGP-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v2, v9
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v5
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v7
-; CGP-NEXT:    v_trunc_f32_e32 v10, v10
-; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v10
+; CGP-NEXT:    v_xor_b32_e32 v4, v6, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v5
+; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
+; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
+; CGP-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
+; CGP-NEXT:    v_trunc_f32_e32 v7, v7
+; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v8, v8, v9
-; CGP-NEXT:    v_mul_lo_u32 v13, v12, v7
-; CGP-NEXT:    v_mul_lo_u32 v14, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v16, v11, v7
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v7
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v9
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT:    v_mul_lo_u32 v11, v10, v6
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v7
+; CGP-NEXT:    v_mul_hi_u32 v14, v9, v6
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v6
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v14, v6, v11
+; CGP-NEXT:    v_mul_hi_u32 v15, v6, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v15, v7, v11
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT:    v_mul_hi_u32 v14, v6, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; CGP-NEXT:    v_mul_lo_u32 v14, v10, v15
-; CGP-NEXT:    v_mul_lo_u32 v16, v7, v13
-; CGP-NEXT:    v_mul_hi_u32 v17, v7, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v10, v15
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v17, v10, v13
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT:    v_mul_hi_u32 v16, v7, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
-; CGP-NEXT:    v_addc_u32_e64 v14, s[4:5], v10, v13, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v12, v7
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v16, v11, v7
-; CGP-NEXT:    v_mul_hi_u32 v11, v11, v7
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT:    v_addc_u32_e64 v12, s[4:5], v7, v11, vcc
+; CGP-NEXT:    v_mul_lo_u32 v10, v10, v6
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v12
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v6
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v6
+; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v11
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
+; CGP-NEXT:    v_mul_hi_u32 v11, v6, v14
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
+; CGP-NEXT:    v_mul_lo_u32 v10, v12, v14
+; CGP-NEXT:    v_mul_lo_u32 v13, v6, v9
+; CGP-NEXT:    v_mul_hi_u32 v14, v12, v14
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v16
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; CGP-NEXT:    v_mul_lo_u32 v12, v14, v16
-; CGP-NEXT:    v_mul_lo_u32 v15, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v16, v14, v16
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v13, v14, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
-; CGP-NEXT:    v_mul_hi_u32 v15, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v14, v11
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v11, v12, v9
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v6, v9
+; CGP-NEXT:    v_mul_hi_u32 v9, v12, v9
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v11, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v3, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, v2, v7
+; CGP-NEXT:    v_mul_hi_u32 v11, v2, v6
+; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v3, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, v8, v10
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v7
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_mul_hi_u32 v10, v2, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v3, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_mul_hi_u32 v12, v8, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v3, v10
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_mul_lo_u32 v11, v4, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, v5, v10
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v5, v7
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT:    v_subb_u32_e64 v10, s[4:5], v3, v7, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v7
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT:    v_mul_lo_u32 v9, v4, v6
+; CGP-NEXT:    v_mul_lo_u32 v7, v5, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, v6
+; CGP-NEXT:    v_mul_hi_u32 v6, v5, v6
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
+; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v6
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v4
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v2, v5
+; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v8, v5
-; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v5
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v4
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v11, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v4
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v9, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v11, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v9
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v9
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v3, v9, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v2, v8
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v3, v8, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr6
 ; CGP-NEXT:  BB2_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -2480,144 +2486,146 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:  ; %bb.1:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v2
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v2, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v5, v2, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v2
-; CHECK-NEXT:    v_xor_b32_e32 v2, v5, v2
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v2
-; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v0, v7
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
-; CHECK-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CHECK-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v5
-; CHECK-NEXT:    v_trunc_f32_e32 v8, v8
-; CHECK-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v8
+; CHECK-NEXT:    v_xor_b32_e32 v2, v4, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
+; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v2, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
+; CHECK-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CHECK-NEXT:    v_subb_u32_e32 v10, vcc, 0, v2, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v6, v6, v7
-; CHECK-NEXT:    v_mul_lo_u32 v11, v10, v5
-; CHECK-NEXT:    v_mul_lo_u32 v12, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v14, v9, v5
-; CHECK-NEXT:    v_mul_lo_u32 v13, v9, v5
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v4
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v5
+; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v4
+; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v4
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v11
+; CHECK-NEXT:    v_mul_lo_u32 v12, v4, v9
+; CHECK-NEXT:    v_mul_hi_u32 v13, v4, v11
+; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v11
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CHECK-NEXT:    v_mul_hi_u32 v12, v4, v9
+; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v13
-; CHECK-NEXT:    v_mul_lo_u32 v14, v5, v11
-; CHECK-NEXT:    v_mul_hi_u32 v15, v5, v13
-; CHECK-NEXT:    v_mul_hi_u32 v13, v8, v13
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v15, v8, v11
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; CHECK-NEXT:    v_mul_hi_u32 v14, v5, v11
-; CHECK-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
-; CHECK-NEXT:    v_addc_u32_e64 v12, s[4:5], v8, v11, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v10, v10, v5
-; CHECK-NEXT:    v_mul_lo_u32 v13, v9, v12
-; CHECK-NEXT:    v_mul_lo_u32 v14, v9, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v9, v5
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v8, v8, v4
+; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v10
+; CHECK-NEXT:    v_mul_lo_u32 v12, v7, v4
+; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v4
+; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
+; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v12
+; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v10, v12
+; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v7
+; CHECK-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v14
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
-; CHECK-NEXT:    v_mul_lo_u32 v10, v12, v14
-; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
-; CHECK-NEXT:    v_mul_hi_u32 v14, v12, v14
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v11, v12, v9
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v10
-; CHECK-NEXT:    v_mul_hi_u32 v13, v5, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v12, v9
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; CHECK-NEXT:    v_mul_lo_u32 v9, v10, v7
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
+; CHECK-NEXT:    v_mul_hi_u32 v11, v4, v7
+; CHECK-NEXT:    v_mul_hi_u32 v7, v10, v7
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; CHECK-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
+; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v8, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
+; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v5
-; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v8
-; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v11, v1, v8
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v1, v8
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT:    v_mul_lo_u32 v9, v2, v5
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v8
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v6, v10
-; CHECK-NEXT:    v_subb_u32_e64 v8, s[4:5], v1, v5, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v4
+; CHECK-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v4
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v2
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v0, v3
+; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v9, vcc, v6, v3
-; CHECK-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v2
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v9, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v2
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v7, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v7
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT:    v_subb_u32_e32 v3, vcc, v1, v7, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v0, v6
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT:    v_subb_u32_e32 v3, vcc, v1, v6, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:  BB7_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -2965,139 +2973,141 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_xor_b32_e32 v0, v4, v0
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v1
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
+; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v7
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v10
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v5, v11
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; CGP-NEXT:    v_sub_i32_e32 v13, vcc, 0, v1
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v7, v10, vcc
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
+; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v12, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v12, v12
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v12
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
+; CGP-NEXT:    v_trunc_f32_e32 v7, v7
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; CGP-NEXT:    v_subb_u32_e32 v14, vcc, 0, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v11
-; CGP-NEXT:    v_mul_lo_u32 v15, v14, v4
-; CGP-NEXT:    v_mul_lo_u32 v16, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v18, v13, v4
-; CGP-NEXT:    v_mul_lo_u32 v17, v13, v4
-; CGP-NEXT:    v_xor_b32_e32 v7, v7, v11
+; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v10
+; CGP-NEXT:    v_mul_lo_u32 v13, v12, v4
+; CGP-NEXT:    v_mul_lo_u32 v14, v11, v7
+; CGP-NEXT:    v_mul_hi_u32 v16, v11, v4
+; CGP-NEXT:    v_mul_lo_u32 v15, v11, v4
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_mul_lo_u32 v14, v7, v15
+; CGP-NEXT:    v_mul_lo_u32 v16, v4, v13
+; CGP-NEXT:    v_mul_hi_u32 v17, v4, v15
+; CGP-NEXT:    v_mul_hi_u32 v15, v7, v15
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v17, v7, v13
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT:    v_mul_hi_u32 v16, v4, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v17
-; CGP-NEXT:    v_mul_lo_u32 v18, v4, v15
-; CGP-NEXT:    v_mul_hi_u32 v19, v4, v17
-; CGP-NEXT:    v_mul_hi_u32 v17, v12, v17
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v19
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v19, v12, v15
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
-; CGP-NEXT:    v_mul_hi_u32 v18, v4, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v12, v15
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v16
-; CGP-NEXT:    v_addc_u32_e64 v16, s[4:5], v12, v15, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v14, v4
-; CGP-NEXT:    v_mul_lo_u32 v17, v13, v16
-; CGP-NEXT:    v_mul_lo_u32 v18, v13, v4
-; CGP-NEXT:    v_mul_hi_u32 v13, v13, v4
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT:    v_addc_u32_e64 v14, s[4:5], v7, v13, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v12, v4
+; CGP-NEXT:    v_mul_lo_u32 v15, v11, v14
+; CGP-NEXT:    v_mul_lo_u32 v16, v11, v4
+; CGP-NEXT:    v_mul_hi_u32 v11, v11, v4
+; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v13
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v16
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v14, v16
+; CGP-NEXT:    v_mul_lo_u32 v15, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v16, v14, v16
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT:    v_mul_hi_u32 v15, v4, v18
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT:    v_mul_lo_u32 v14, v16, v18
-; CGP-NEXT:    v_mul_lo_u32 v17, v4, v13
-; CGP-NEXT:    v_mul_hi_u32 v18, v16, v18
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v15, v16, v13
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v17, v14
-; CGP-NEXT:    v_mul_hi_u32 v17, v4, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v16, v13
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v17, s[4:5], v18, v17
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v17, v15
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v13, v14, v11
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
+; CGP-NEXT:    v_mul_hi_u32 v15, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v14, v11
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; CGP-NEXT:    v_addc_u32_e32 v12, vcc, v12, v13, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT:    v_addc_u32_e32 v12, vcc, 0, v12, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v7, v4
-; CGP-NEXT:    v_mul_lo_u32 v14, v6, v12
-; CGP-NEXT:    v_mul_hi_u32 v15, v6, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v15, v7, v12
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_mul_hi_u32 v14, v6, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v7, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
+; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v6, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v5, v7
+; CGP-NEXT:    v_mul_hi_u32 v13, v5, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v6, v7
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT:    v_mul_lo_u32 v13, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v1, v12
-; CGP-NEXT:    v_mul_lo_u32 v14, v1, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v6, v14
-; CGP-NEXT:    v_subb_u32_e64 v12, s[4:5], v7, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v7, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_mul_lo_u32 v11, v0, v4
+; CGP-NEXT:    v_mul_lo_u32 v7, v1, v7
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v6, v4, vcc
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v6, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v0
+; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v5, v1
+; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v0
-; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v0, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v13, vcc, v6, v1
-; CGP-NEXT:    v_subbrev_u32_e64 v14, s[4:5], 0, v4, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v14, v0
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v4, v0, vcc
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v15, v15, v16, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v11, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v14, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v11
-; CGP-NEXT:    v_xor_b32_e32 v4, v0, v11
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v11
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v4, v11, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v10
+; CGP-NEXT:    v_xor_b32_e32 v4, v0, v10
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v10
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v4, v10, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr5
+; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
 ; CGP-NEXT:  BB8_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -3138,139 +3148,141 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_xor_b32_e32 v4, v6, v4
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v5
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v4
-; CGP-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v2, v9
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v5
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
+; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
 ; CGP-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v6
-; CGP-NEXT:    v_trunc_f32_e32 v10, v10
-; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v10
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
+; CGP-NEXT:    v_trunc_f32_e32 v7, v7
+; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v7, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v13, v12, v6
-; CGP-NEXT:    v_mul_lo_u32 v14, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v16, v11, v6
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v6
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v9
+; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT:    v_mul_lo_u32 v11, v10, v6
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v7
+; CGP-NEXT:    v_mul_hi_u32 v14, v9, v6
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v6
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v14, v6, v11
+; CGP-NEXT:    v_mul_hi_u32 v15, v6, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v15, v7, v11
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT:    v_mul_hi_u32 v14, v6, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; CGP-NEXT:    v_mul_lo_u32 v14, v10, v15
-; CGP-NEXT:    v_mul_lo_u32 v16, v6, v13
-; CGP-NEXT:    v_mul_hi_u32 v17, v6, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v10, v15
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v17, v10, v13
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT:    v_mul_hi_u32 v16, v6, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
-; CGP-NEXT:    v_addc_u32_e64 v14, s[4:5], v10, v13, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v12, v6
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v16, v11, v6
-; CGP-NEXT:    v_mul_hi_u32 v11, v11, v6
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT:    v_addc_u32_e64 v12, s[4:5], v7, v11, vcc
+; CGP-NEXT:    v_mul_lo_u32 v10, v10, v6
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v12
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v6
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v6
+; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v11
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
+; CGP-NEXT:    v_mul_hi_u32 v11, v6, v14
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
+; CGP-NEXT:    v_mul_lo_u32 v10, v12, v14
+; CGP-NEXT:    v_mul_lo_u32 v13, v6, v9
+; CGP-NEXT:    v_mul_hi_u32 v14, v12, v14
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_mul_hi_u32 v13, v6, v16
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; CGP-NEXT:    v_mul_lo_u32 v12, v14, v16
-; CGP-NEXT:    v_mul_lo_u32 v15, v6, v11
-; CGP-NEXT:    v_mul_hi_u32 v16, v14, v16
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v13, v14, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
-; CGP-NEXT:    v_mul_hi_u32 v15, v6, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v14, v11
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; CGP-NEXT:    v_mul_lo_u32 v11, v12, v9
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v6, v9
+; CGP-NEXT:    v_mul_hi_u32 v9, v12, v9
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v11, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v3, v6
-; CGP-NEXT:    v_mul_lo_u32 v12, v7, v10
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v3, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, v2, v7
+; CGP-NEXT:    v_mul_hi_u32 v11, v2, v6
 ; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v3, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_mul_hi_u32 v12, v7, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v3, v10
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v7
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_mul_hi_u32 v10, v2, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_mul_lo_u32 v11, v4, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, v5, v10
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v5, v6
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT:    v_subb_u32_e64 v10, s[4:5], v3, v6, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT:    v_mul_lo_u32 v9, v4, v6
+; CGP-NEXT:    v_mul_lo_u32 v7, v5, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, v6
+; CGP-NEXT:    v_mul_hi_u32 v6, v5, v6
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v6
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v4
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v7, v5
-; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v2, v5
+; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v4
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v11, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v4
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v9, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v11, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v9
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v9
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v3, v9, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v2, v8
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v3, v8, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr8_vgpr9
 ; CGP-NEXT:  BB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 1aadea0b9b835..4ee838f942b39 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -119,30 +119,32 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, 1, v10
 ; CHECK-NEXT:    v_addc_u32_e32 v12, vcc, 0, v11, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v6, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT:    v_subb_u32_e64 v7, s[4:5], v1, v6, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v6
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v6, v13, v6, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v7, v2
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v9, v7, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v10, v8, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v11, v12, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v10, v8, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:  BB0_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -739,30 +741,32 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v14
 ; CGP-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v8, v11
-; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v9, v10, vcc
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT:    v_subb_u32_e64 v11, s[4:5], v9, v10, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v9, s[4:5], v9, v10
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v9, vcc, v9, v5, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v13, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v10, v17, v10, vcc
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v11, v4
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v10, v13, v10, vcc
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v4
 ; CGP-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v13, v11, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v14, v12, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v9, v15, v16, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v14, v12, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v15, v16, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr8
+; CGP-NEXT:    ; implicit-def: $vgpr4
 ; CGP-NEXT:  BB2_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -901,30 +905,32 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v12
 ; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v13, vcc
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v2, v9
-; CGP-NEXT:    v_subb_u32_e64 v11, s[4:5], v3, v8, vcc
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT:    v_subb_u32_e64 v9, s[4:5], v3, v8, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v8
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v6
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v7
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v15, v8, vcc
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v9, v6
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v11, v9, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v10, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v12, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr6
 ; CGP-NEXT:  BB2_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -2399,30 +2405,32 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, 1, v10
 ; CHECK-NEXT:    v_addc_u32_e32 v12, vcc, 0, v11, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v6, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT:    v_subb_u32_e64 v7, s[4:5], v1, v6, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v6
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v4
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v6, v13, v6, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v7, v4
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v9, v7, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v10, v8, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v10, v8, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:  BB7_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -2842,30 +2850,32 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v14
 ; CGP-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v5, v6
-; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v7, v4, vcc
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT:    v_subb_u32_e64 v6, s[4:5], v7, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v7, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v10
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v11, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v13, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v13, v7, vcc
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v10
 ; CGP-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v13, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v4, v14, v12, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v15, v16, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v15, v16, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr5
+; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
 ; CGP-NEXT:  BB8_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -3004,30 +3014,32 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v12
 ; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v13, vcc
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v2, v7
-; CGP-NEXT:    v_subb_u32_e64 v11, s[4:5], v3, v6, vcc
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
+; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v6
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v8
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc
-; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v11, v7, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v10, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v12, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v3, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr8_vgpr9
 ; CGP-NEXT:  BB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 954022fb71c04..580dc2f4b81d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -115,33 +115,35 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_mul_lo_u32 v5, v2, v5
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e64 v6, s[4:5], v1, v4, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v4
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v5, v2
-; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v0, v2
+; CHECK-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v11, vcc, v7, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v6, v2
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v10, v9, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v7, v11, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v5, v6, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:  BB0_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -728,32 +730,34 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v11
-; CGP-NEXT:    v_subb_u32_e64 v10, s[4:5], v9, v0, vcc
+; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v9, v0, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v9, v0
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v1, v4
-; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v0, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v1, v4
+; CGP-NEXT:    v_subbrev_u32_e64 v11, s[4:5], 0, v0, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v15, vcc, v11, v4
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v10, v4
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v12, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v14, v13, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v13, v12, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v11, v15, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v11, v12, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v11, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v5, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v10, v11, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v8, v5, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr8
+; CGP-NEXT:    ; implicit-def: $vgpr4
 ; CGP-NEXT:  BB2_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -886,33 +890,35 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v5, v6, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v2, v9
-; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v3, v4, vcc
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT:    v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v6
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v7
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v5, v6
-; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v2, v6
+; CGP-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v13, vcc, v9, v6
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v7
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v8, v6
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v12, v11, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v7, v11, v10, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v9, v13, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v8, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr6
 ; CGP-NEXT:  BB2_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -1755,33 +1761,35 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_mul_lo_u32 v3, v4, v3
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e64 v6, s[4:5], v1, v2, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT:    v_subb_u32_e64 v3, s[4:5], v1, v2, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v4
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v3, v4
-; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v0, v4
+; CHECK-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v11, vcc, v7, v4
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v6, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v5, v10, v9, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v9, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v5, v7, v11, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v6, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; CHECK-NEXT:  BB7_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -2197,29 +2205,31 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_subb_u32_e64 v4, s[4:5], v7, v0, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v7, v0
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v11, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v1, v10
-; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v0, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v1, v10
+; CGP-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v11, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v15, vcc, v7, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v6, v10
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v12, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v11, v14, v13, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v11, v12, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v7, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr5
+; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
 ; CGP-NEXT:  BB8_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -2352,33 +2362,35 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mul_lo_u32 v5, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v2, v7
-; CGP-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v4, vcc
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
+; CGP-NEXT:    v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v8
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v5, v8
-; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v9
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v2, v8
+; CGP-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v13, vcc, v7, v8
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v6, v8
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v9, v12, v11, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
+; CGP-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v7, v13, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v6, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr8_vgpr9
 ; CGP-NEXT:  BB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index d10d911aeac07..ffba5af091758 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -8,135 +8,136 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-LABEL: sdiv64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_or_b32_e32 v4, v1, v5
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[3:4]
-; GFX9-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz BB0_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v3
-; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v5
-; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, 0, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v16, 0
-; GFX9-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v15, 0
-; GFX9-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GFX9-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
-; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX9-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
+; GFX9-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX9-NEXT:    v_mov_b32_e32 v14, 0
+; GFX9-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GFX9-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX9-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT:    v_mul_lo_u32 v10, v9, v6
-; GFX9-NEXT:    v_mul_hi_u32 v11, v8, v6
-; GFX9-NEXT:    v_mul_lo_u32 v12, v8, v7
-; GFX9-NEXT:    v_mul_lo_u32 v13, v8, v6
-; GFX9-NEXT:    v_add3_u32 v10, v11, v12, v10
-; GFX9-NEXT:    v_mul_lo_u32 v12, v6, v10
-; GFX9-NEXT:    v_mul_hi_u32 v14, v6, v13
-; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v10
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v14, v12
-; GFX9-NEXT:    v_mul_lo_u32 v14, v7, v13
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v11, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v13, v7, v13
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v14
-; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v10
-; GFX9-NEXT:    v_mul_lo_u32 v10, v7, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v13, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_add_co_u32_e64 v6, s[4:5], v6, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v12, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v10, vcc, v7, v11, s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v12, v8, v10
-; GFX9-NEXT:    v_mul_hi_u32 v13, v8, v6
-; GFX9-NEXT:    v_mul_lo_u32 v9, v9, v6
-; GFX9-NEXT:    v_mul_lo_u32 v8, v8, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v11
-; GFX9-NEXT:    v_add3_u32 v9, v13, v12, v9
-; GFX9-NEXT:    v_mul_lo_u32 v12, v6, v9
-; GFX9-NEXT:    v_mul_hi_u32 v13, v6, v8
-; GFX9-NEXT:    v_mul_hi_u32 v14, v6, v9
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
-; GFX9-NEXT:    v_mul_hi_u32 v13, v10, v8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v10, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, v16, v14, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v12, v8
-; GFX9-NEXT:    v_mul_hi_u32 v8, v10, v9
-; GFX9-NEXT:    v_mul_lo_u32 v9, v10, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v14, v13, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v15, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v12, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v16, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v7, vcc, v7, v8, s[4:5]
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v0, v8
-; GFX9-NEXT:    v_xor_b32_e32 v9, v9, v8
-; GFX9-NEXT:    v_mul_lo_u32 v10, v9, v7
-; GFX9-NEXT:    v_mul_hi_u32 v11, v9, v6
+; GFX9-NEXT:    v_mul_lo_u32 v9, v8, v5
+; GFX9-NEXT:    v_mul_hi_u32 v10, v7, v5
+; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v6
+; GFX9-NEXT:    v_mul_lo_u32 v12, v7, v5
+; GFX9-NEXT:    v_add3_u32 v9, v10, v11, v9
+; GFX9-NEXT:    v_mul_lo_u32 v11, v5, v9
+; GFX9-NEXT:    v_mul_hi_u32 v13, v5, v12
+; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v9
+; GFX9-NEXT:    v_mul_hi_u32 v16, v6, v9
+; GFX9-NEXT:    v_mul_lo_u32 v9, v6, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v13, v11
+; GFX9-NEXT:    v_mul_lo_u32 v13, v6, v12
+; GFX9-NEXT:    v_mul_hi_u32 v12, v6, v12
+; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v13
+; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v14, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[4:5], v5, v9
+; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5]
+; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v9
+; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, v8, v5
+; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v5
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
+; GFX9-NEXT:    v_add3_u32 v8, v12, v11, v8
+; GFX9-NEXT:    v_mul_lo_u32 v13, v5, v8
+; GFX9-NEXT:    v_mul_hi_u32 v16, v5, v7
+; GFX9-NEXT:    v_mul_hi_u32 v17, v5, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v8, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v12, v1, v6
-; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v6
-; GFX9-NEXT:    v_mul_hi_u32 v13, v1, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v11, v6, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v15, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v16, v10, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, v5, v6
-; GFX9-NEXT:    v_mul_lo_u32 v11, v4, v7
-; GFX9-NEXT:    v_mul_hi_u32 v12, v4, v6
-; GFX9-NEXT:    v_mul_lo_u32 v13, v4, v6
-; GFX9-NEXT:    v_add3_u32 v10, v12, v11, v10
-; GFX9-NEXT:    v_sub_u32_e32 v11, v1, v10
-; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, v9, v13
-; GFX9-NEXT:    v_subb_co_u32_e64 v11, s[4:5], v11, v5, vcc
-; GFX9-NEXT:    v_sub_co_u32_e64 v12, s[4:5], v9, v4
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v11, s[4:5], 0, v11, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v5
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v13, v12, s[4:5]
-; GFX9-NEXT:    v_add_co_u32_e64 v12, s[4:5], 2, v6
-; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[4:5], 0, v7, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_add_co_u32_e64 v14, s[4:5], 1, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[4:5], 0, v7, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v4, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v14, v12, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v15, v13, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v5, v8, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v11, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v3, v4, v5
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v16, v13
+; GFX9-NEXT:    v_mul_hi_u32 v11, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, v15, v17, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v9, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v13, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v16, v12, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v11, v14, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v15, v9, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v7
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v7
+; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v6
+; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v15, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v10, v1, v5
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GFX9-NEXT:    v_mul_hi_u32 v11, v1, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v14, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v8, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v5
+; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v6
+; GFX9-NEXT:    v_mul_hi_u32 v10, v2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v11, v2, v5
+; GFX9-NEXT:    v_add3_u32 v8, v10, v9, v8
+; GFX9-NEXT:    v_sub_u32_e32 v9, v1, v8
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v11
+; GFX9-NEXT:    v_subb_co_u32_e64 v9, s[4:5], v9, v3, vcc
+; GFX9-NEXT:    v_sub_co_u32_e64 v10, s[4:5], v0, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[4:5], 0, v9, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v11, v10, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v10, s[4:5], 2, v5
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_add_co_u32_e64 v12, s[4:5], 1, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v12, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v13, v11, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, v7, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v9, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v1, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v0, v2, vcc
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:  BB0_2: ; %Flow
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -144,6 +145,7 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
@@ -156,16 +158,15 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
 ; GFX9-NEXT:  BB0_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = sdiv i64 %a, %b
   ret i64 %d
@@ -261,33 +262,35 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v4
 ; GFX9-NEXT:    v_add3_u32 v6, v8, v7, v6
 ; GFX9-NEXT:    v_sub_u32_e32 v7, v1, v6
-; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v0, v9
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v9
 ; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[4:5], v7, v3, vcc
-; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v8, v2
+; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v0, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5]
 ; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v9, s[4:5]
-; GFX9-NEXT:    v_add_co_u32_e64 v9, s[4:5], 2, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[4:5], 0, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e64 v8, s[4:5], 2, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, v5, s[4:5]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e64 v11, s[4:5], 1, v4
+; GFX9-NEXT:    v_add_co_u32_e64 v10, s[4:5], 1, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[4:5], 0, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], 0, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v8, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v12, v10, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v11, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v10, v8, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:  BB1_2: ; %Flow
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -326,133 +329,134 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-LABEL: srem64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_or_b32_e32 v4, v1, v5
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[3:4]
-; GFX9-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz BB2_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v3
-; GFX9-NEXT:    v_xor_b32_e32 v3, v4, v3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, v5
-; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v15, 0
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
-; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, 0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v14, 0
+; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GFX9-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
+; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT:    v_mul_lo_u32 v9, v8, v4
-; GFX9-NEXT:    v_mul_hi_u32 v10, v7, v4
-; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v6
-; GFX9-NEXT:    v_mul_lo_u32 v12, v7, v4
-; GFX9-NEXT:    v_add3_u32 v9, v10, v11, v9
-; GFX9-NEXT:    v_mul_lo_u32 v11, v4, v9
-; GFX9-NEXT:    v_mul_hi_u32 v13, v4, v12
-; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v9
-; GFX9-NEXT:    v_mul_hi_u32 v16, v6, v9
-; GFX9-NEXT:    v_mul_lo_u32 v9, v6, v9
-; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v13, v11
-; GFX9-NEXT:    v_mul_lo_u32 v13, v6, v12
-; GFX9-NEXT:    v_mul_hi_u32 v12, v6, v12
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v10, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v13
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v14, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v9
-; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v4
-; GFX9-NEXT:    v_mul_lo_u32 v8, v8, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v4
+; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v5
+; GFX9-NEXT:    v_mul_lo_u32 v11, v6, v4
+; GFX9-NEXT:    v_add3_u32 v8, v9, v10, v8
+; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v8
+; GFX9-NEXT:    v_mul_hi_u32 v12, v4, v11
+; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v8
+; GFX9-NEXT:    v_mul_hi_u32 v15, v5, v8
+; GFX9-NEXT:    v_mul_lo_u32 v8, v5, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v12, v10
+; GFX9-NEXT:    v_mul_lo_u32 v12, v5, v11
+; GFX9-NEXT:    v_mul_hi_u32 v11, v5, v11
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v14, v9, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v13, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v14, v10, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5]
+; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v4
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
-; GFX9-NEXT:    v_add3_u32 v8, v12, v11, v8
-; GFX9-NEXT:    v_mul_lo_u32 v11, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v12, v4, v7
-; GFX9-NEXT:    v_mul_hi_u32 v16, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v13, v9, v8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v9, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
-; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, v15, v16, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v11, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v16, v12, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v13, v14, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v15, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5]
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v7
-; GFX9-NEXT:    v_xor_b32_e32 v8, v8, v7
-; GFX9-NEXT:    v_mul_lo_u32 v9, v8, v6
-; GFX9-NEXT:    v_mul_hi_u32 v10, v8, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v4
+; GFX9-NEXT:    v_add_u32_e32 v5, v5, v9
+; GFX9-NEXT:    v_add3_u32 v7, v11, v10, v7
+; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v7
+; GFX9-NEXT:    v_mul_hi_u32 v15, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v16, v4, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v8, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v11, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v12, v1, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v11
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v12, v14, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v8, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v15, v12
+; GFX9-NEXT:    v_mul_hi_u32 v10, v8, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, v14, v16, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v8, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v11, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v13, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v14, v8, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, v5, v4
-; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v4
-; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GFX9-NEXT:    v_add3_u32 v6, v10, v6, v9
-; GFX9-NEXT:    v_sub_u32_e32 v9, v1, v6
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v8, v4
-; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v9, v5, vcc
-; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v4, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v5
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[6:7]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v9, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[6:7]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v10, v5
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v8, v5, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[6:7]
-; GFX9-NEXT:    v_sub_co_u32_e64 v12, s[4:5], v9, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v12, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, v10, v8, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v7, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v6
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v6
+; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v14, v9, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v9, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v10, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v13, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v14, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v3, v4
+; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v5, v2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v4, v2, v4
+; GFX9-NEXT:    v_add3_u32 v5, v8, v5, v7
+; GFX9-NEXT:    v_sub_u32_e32 v7, v1, v5
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v7, v3, vcc
+; GFX9-NEXT:    v_sub_co_u32_e64 v7, s[4:5], v0, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[6:7], 0, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v8, v3
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[6:7]
+; GFX9-NEXT:    v_sub_co_u32_e64 v10, s[4:5], v7, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v9
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v10, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v6
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v6
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v6, vcc
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:  BB2_2: ; %Flow
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
 ; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -460,7 +464,7 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
@@ -475,11 +479,11 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v2
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX9-NEXT:  BB2_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = srem i64 %a, %b
   ret i64 %d
@@ -575,32 +579,34 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v2, v4
 ; GFX9-NEXT:    v_add3_u32 v5, v7, v5, v6
 ; GFX9-NEXT:    v_sub_u32_e32 v6, v1, v5
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v4
-; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v6, v3, vcc
-; GFX9-NEXT:    v_sub_co_u32_e64 v7, s[4:5], v4, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[6:7], 0, v6, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[6:7]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v8, v3
-; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[4:5], v6, v3, s[4:5]
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v6, v3, vcc
+; GFX9-NEXT:    v_sub_co_u32_e64 v6, s[4:5], v0, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[6:7], 0, v4, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[6:7]
-; GFX9-NEXT:    v_sub_co_u32_e64 v10, s[4:5], v7, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v6, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v7, v3
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v9
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[4:5], v4, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[6:7]
+; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v6, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[4:5], 0, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v7, v10, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v6, v9, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:  BB3_2: ; %Flow
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
 ; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -762,147 +768,148 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-LABEL: sdivrem64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v7, v3
-; GFX9-NEXT:    v_or_b32_e32 v4, v1, v7
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[3:4]
-; GFX9-NEXT:    ; implicit-def: $vgpr5_vgpr6
-; GFX9-NEXT:    ; implicit-def: $vgpr3_vgpr4
+; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX9-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz BB8_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v6, v4, v3
-; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, v6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v5
-; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, 0, v6
-; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v16, 0
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v7
-; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, 0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v15, 0
-; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GFX9-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT:    v_mul_lo_u32 v10, v9, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v8, v4
-; GFX9-NEXT:    v_mul_lo_u32 v12, v8, v7
-; GFX9-NEXT:    v_mul_lo_u32 v13, v8, v4
-; GFX9-NEXT:    v_add3_u32 v10, v11, v12, v10
-; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v10
-; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v13
-; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v10
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v14, v12
-; GFX9-NEXT:    v_mul_lo_u32 v14, v7, v13
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v11, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v13, v7, v13
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v14
-; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v10
-; GFX9-NEXT:    v_mul_lo_u32 v10, v7, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v13, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v12, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v10, vcc, v7, v11, s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v12, v8, v10
-; GFX9-NEXT:    v_mul_hi_u32 v13, v8, v4
-; GFX9-NEXT:    v_mul_lo_u32 v9, v9, v4
-; GFX9-NEXT:    v_mul_lo_u32 v8, v8, v4
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v11
-; GFX9-NEXT:    v_add3_u32 v9, v13, v12, v9
-; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GFX9-NEXT:    v_mul_hi_u32 v13, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v9
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
-; GFX9-NEXT:    v_mul_hi_u32 v13, v10, v8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v10, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v14, vcc, v16, v14, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v12, v8
-; GFX9-NEXT:    v_mul_hi_u32 v8, v10, v9
-; GFX9-NEXT:    v_mul_lo_u32 v9, v10, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v14, v13, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v15, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v12, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v16, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v7, vcc, v7, v8, s[4:5]
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v0, v8
-; GFX9-NEXT:    v_xor_b32_e32 v9, v9, v8
-; GFX9-NEXT:    v_mul_lo_u32 v10, v9, v7
-; GFX9-NEXT:    v_mul_hi_u32 v11, v9, v4
+; GFX9-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX9-NEXT:    v_mov_b32_e32 v14, 0
+; GFX9-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GFX9-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX9-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT:    v_mul_lo_u32 v9, v8, v5
+; GFX9-NEXT:    v_mul_hi_u32 v10, v7, v5
+; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v6
+; GFX9-NEXT:    v_mul_lo_u32 v12, v7, v5
+; GFX9-NEXT:    v_add3_u32 v9, v10, v11, v9
+; GFX9-NEXT:    v_mul_lo_u32 v11, v5, v9
+; GFX9-NEXT:    v_mul_hi_u32 v13, v5, v12
+; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v9
+; GFX9-NEXT:    v_mul_hi_u32 v16, v6, v9
+; GFX9-NEXT:    v_mul_lo_u32 v9, v6, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v13, v11
+; GFX9-NEXT:    v_mul_lo_u32 v13, v6, v12
+; GFX9-NEXT:    v_mul_hi_u32 v12, v6, v12
+; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v13
+; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v12, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v14, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[4:5], v5, v9
+; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5]
+; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v9
+; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, v8, v5
+; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v5
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
+; GFX9-NEXT:    v_add3_u32 v8, v12, v11, v8
+; GFX9-NEXT:    v_mul_lo_u32 v13, v5, v8
+; GFX9-NEXT:    v_mul_hi_u32 v16, v5, v7
+; GFX9-NEXT:    v_mul_hi_u32 v17, v5, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v8, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v12, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v13, v1, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v15, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v16, v10, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, v5, v4
-; GFX9-NEXT:    v_mul_lo_u32 v11, v6, v7
-; GFX9-NEXT:    v_mul_hi_u32 v12, v6, v4
-; GFX9-NEXT:    v_mul_lo_u32 v13, v6, v4
-; GFX9-NEXT:    v_add3_u32 v10, v12, v11, v10
-; GFX9-NEXT:    v_sub_u32_e32 v11, v1, v10
-; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, v9, v13
-; GFX9-NEXT:    v_subb_co_u32_e64 v11, s[4:5], v11, v5, vcc
-; GFX9-NEXT:    v_sub_co_u32_e64 v12, s[4:5], v9, v6
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v13, s[6:7], 0, v11, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v13, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[6:7]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v12, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[6:7]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v13, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[6:7]
-; GFX9-NEXT:    v_add_co_u32_e64 v15, s[6:7], 2, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v16, s[6:7], 0, v7, s[6:7]
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v17, s[6:7], 1, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v18, s[6:7], 0, v7, s[6:7]
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v14
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v18, v16, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v16, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[4:5], v11, v5, s[4:5]
-; GFX9-NEXT:    v_sub_co_u32_e64 v6, s[4:5], v12, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v16, vcc
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[4:5], 0, v5, s[4:5]
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v12, v6, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v17, v15, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v10, v8, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v14, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v3, v4, v10
-; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v8
-; GFX9-NEXT:    v_xor_b32_e32 v7, v7, v10
-; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[8:9], v3, v10
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v8
-; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, v5, v8
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[8:9], v7, v10, s[8:9]
-; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v1, v8, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v16, v13
+; GFX9-NEXT:    v_mul_hi_u32 v11, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, v15, v17, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v9, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v13, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v16, v12, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v11, v14, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v15, v9, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5]
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v7
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v7
+; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v6
+; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v15, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v10, v1, v5
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GFX9-NEXT:    v_mul_hi_u32 v11, v1, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v14, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v8, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v5
+; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v6
+; GFX9-NEXT:    v_mul_hi_u32 v10, v2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v11, v2, v5
+; GFX9-NEXT:    v_add3_u32 v8, v10, v9, v8
+; GFX9-NEXT:    v_sub_u32_e32 v9, v1, v8
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v11
+; GFX9-NEXT:    v_subb_co_u32_e64 v9, s[4:5], v9, v3, vcc
+; GFX9-NEXT:    v_sub_co_u32_e64 v10, s[4:5], v0, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v11, s[6:7], 0, v9, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v11, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v11, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[6:7]
+; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 2, v5
+; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v6, s[6:7]
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v15, s[6:7], 1, v5
+; GFX9-NEXT:    v_addc_co_u32_e64 v16, s[6:7], 0, v6, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v16, v14, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v14, vcc
+; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[4:5], v9, v3, s[4:5]
+; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[4:5], v10, v2
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v15, v13, s[6:7]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v8, v7, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v10, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v6, v6, v8
+; GFX9-NEXT:    v_sub_co_u32_e64 v4, s[8:9], v4, v8
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v7
+; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[8:9], v6, v8, s[8:9]
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v7
+; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v0, v7
+; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v1, v7, vcc
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:  BB8_2: ; %Flow
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[10:11]
 ; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -910,7 +917,8 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
@@ -919,24 +927,23 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v0, v2
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v6, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
 ; GFX9-NEXT:  BB8_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    v_mov_b32_e32 v2, v5
-; GFX9-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v2, v6
+; GFX9-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = sdiv i64 %a, %b
   %r = srem i64 %a, %b
@@ -1036,40 +1043,42 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v4
 ; GFX9-NEXT:    v_add3_u32 v6, v8, v7, v6
 ; GFX9-NEXT:    v_sub_u32_e32 v7, v1, v6
-; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v0, v9
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v9
 ; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[4:5], v7, v3, vcc
-; GFX9-NEXT:    v_sub_co_u32_e64 v9, s[4:5], v8, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[6:7], 0, v7, s[4:5]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v10, v3
+; GFX9-NEXT:    v_sub_co_u32_e64 v8, s[4:5], v0, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[6:7], 0, v7, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v9, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[6:7]
+; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v8, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[6:7]
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v9, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[6:7]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v10, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[6:7]
-; GFX9-NEXT:    v_add_co_u32_e64 v12, s[6:7], 2, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[6:7], 0, v5, s[6:7]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], v9, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[6:7]
+; GFX9-NEXT:    v_add_co_u32_e64 v11, s[6:7], 2, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[6:7], 0, v5, s[6:7]
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v14, s[6:7], 1, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[6:7], 0, v5, s[6:7]
+; GFX9-NEXT:    v_add_co_u32_e64 v13, s[6:7], 1, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, v5, s[6:7]
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v11
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v15, v13, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v14, v12, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v13, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v14, v12, s[6:7]
 ; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[4:5], v7, v3, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_sub_co_u32_e64 v6, s[4:5], v9, v2
+; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[4:5], v8, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v10, v3, s[6:7]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v13, v11, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v6, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v8, v2, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; GFX9-NEXT:  BB9_2: ; %Flow
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], s[8:9]
 ; GFX9-NEXT:    s_xor_b64 exec, exec, s[4:5]

diff  --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index b86db5a7ac689..eedb973f6d167 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -86,7 +86,7 @@ bb.outer.end:                                     ; preds = %bb.inner.then, %bb
 ; GCN-NEXT: s_cbranch_execz [[THEN_INNER:BB[0-9_]+]]
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN:      store_dword
-; GCN-NEXT: {{^}}[[THEN_INNER]]:
+; GCN:      {{^}}[[THEN_INNER]]:
 ; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_INNER3:s\[[0-9:]+\]]], [[SAVEEXEC_INNER2]]
 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_INNER3]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER]]
@@ -136,7 +136,7 @@ bb.outer.end:                                        ; preds = %bb, %bb.then, %b
 ; GCN:      store_dword
 ; GCN-NEXT: {{^}}[[THEN_OUTER_FLOW]]:
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_ELSE]]
-; GCN-NEXT: {{^}}[[THEN_OUTER]]:
+; GCN:      {{^}}[[THEN_OUTER]]:
 ; GCN-NEXT: s_or_saveexec_b64 [[SAVEEXEC_OUTER3:s\[[0-9:]+\]]], [[SAVEEXEC_OUTER2]]
 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVEEXEC_OUTER3]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index d3d2b6949609c..a687d150914a9 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -330,6 +330,8 @@
 ; GCN-O1-NEXT:         Process Implicit Definitions
 ; GCN-O1-NEXT:         Remove unreachable machine basic blocks
 ; GCN-O1-NEXT:         Live Variable Analysis
+; GCN-O1-NEXT:         MachineDominator Tree Construction
+; GCN-O1-NEXT:         SI Optimize VGPR LiveRange
 ; GCN-O1-NEXT:         Eliminate PHI nodes for register allocation
 ; GCN-O1-NEXT:         SI Lower control flow pseudo instructions
 ; GCN-O1-NEXT:         Two-Address instruction pass
@@ -610,6 +612,7 @@
 ; GCN-O1-OPTS-NEXT:         Process Implicit Definitions
 ; GCN-O1-OPTS-NEXT:         Remove unreachable machine basic blocks
 ; GCN-O1-OPTS-NEXT:         Live Variable Analysis
+; GCN-O1-OPTS-NEXT:         SI Optimize VGPR LiveRange
 ; GCN-O1-OPTS-NEXT:         Eliminate PHI nodes for register allocation
 ; GCN-O1-OPTS-NEXT:         SI Lower control flow pseudo instructions
 ; GCN-O1-OPTS-NEXT:         Two-Address instruction pass
@@ -890,6 +893,7 @@
 ; GCN-O2-NEXT:         Process Implicit Definitions
 ; GCN-O2-NEXT:         Remove unreachable machine basic blocks
 ; GCN-O2-NEXT:         Live Variable Analysis
+; GCN-O2-NEXT:         SI Optimize VGPR LiveRange
 ; GCN-O2-NEXT:         Eliminate PHI nodes for register allocation
 ; GCN-O2-NEXT:         SI Lower control flow pseudo instructions
 ; GCN-O2-NEXT:         Two-Address instruction pass
@@ -1184,6 +1188,7 @@
 ; GCN-O3-NEXT:         Process Implicit Definitions
 ; GCN-O3-NEXT:         Remove unreachable machine basic blocks
 ; GCN-O3-NEXT:         Live Variable Analysis
+; GCN-O3-NEXT:         SI Optimize VGPR LiveRange
 ; GCN-O3-NEXT:         Eliminate PHI nodes for register allocation
 ; GCN-O3-NEXT:         SI Lower control flow pseudo instructions
 ; GCN-O3-NEXT:         Two-Address instruction pass

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 07c2575ab422f..5e9b8091c834d 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -164,15 +164,16 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out,
 ; SI-NEXT:    s_cbranch_execz BB3_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v1, v[1:2], s[8:11], 0 addr64
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; SI-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:  BB3_2: ; %Flow
 ; SI-NEXT:    s_or_saveexec_b64 s[2:3], s[2:3]
 ; SI-NEXT:    s_xor_b64 exec, exec, s[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index bc84fe8adab60..aec41958fe0a7 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1160,6 +1160,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; SI-NEXT:    s_cbranch_execz BB14_3
 ; SI-NEXT:  ; %bb.1: ; %kill
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    s_cbranch_scc0 BB14_6
 ; SI-NEXT:  ; %bb.2: ; %kill
 ; SI-NEXT:    s_mov_b64 exec, 0
@@ -1197,6 +1199,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; GFX10-WAVE64-NEXT:    s_cbranch_execz BB14_3
 ; GFX10-WAVE64-NEXT:  ; %bb.1: ; %kill
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-WAVE64-NEXT:    ; implicit-def: $vgpr0
+; GFX10-WAVE64-NEXT:    ; implicit-def: $vgpr1
 ; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB14_6
 ; GFX10-WAVE64-NEXT:  ; %bb.2: ; %kill
 ; GFX10-WAVE64-NEXT:    s_mov_b64 exec, 0
@@ -1234,6 +1238,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; GFX10-WAVE32-NEXT:    s_cbranch_execz BB14_3
 ; GFX10-WAVE32-NEXT:  ; %bb.1: ; %kill
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-WAVE32-NEXT:    ; implicit-def: $vgpr0
+; GFX10-WAVE32-NEXT:    ; implicit-def: $vgpr1
 ; GFX10-WAVE32-NEXT:    s_cbranch_scc0 BB14_6
 ; GFX10-WAVE32-NEXT:  ; %bb.2: ; %kill
 ; GFX10-WAVE32-NEXT:    s_mov_b32 exec_lo, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
new file mode 100644
index 0000000000000..aa721f9fcabcd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-opt-vgpr-liverange=true -stop-after=si-opt-vgpr-liverange -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; a normal if-else
+define amdgpu_ps float @else1(i32 %z, float %v) #0 {
+  ; SI-LABEL: name: else1
+  ; SI: bb.0.main_body:
+  ; SI:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; SI:   liveins: $vgpr0, $vgpr1
+  ; SI:   [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1
+  ; SI:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
+  ; SI:   [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec
+  ; SI:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI:   S_BRANCH %bb.3
+  ; SI: bb.1.Flow:
+  ; SI:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; SI:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %13:vgpr_32, %bb.0, %4, %bb.3
+  ; SI:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, undef %15:vgpr_32, %bb.3
+  ; SI:   [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI:   S_BRANCH %bb.2
+  ; SI: bb.2.if:
+  ; SI:   successors: %bb.4(0x80000000)
+  ; SI:   %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], [[PHI1]], implicit $mode, implicit $exec
+  ; SI:   S_BRANCH %bb.4
+  ; SI: bb.3.else:
+  ; SI:   successors: %bb.1(0x80000000)
+  ; SI:   %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, killed [[COPY]], implicit $mode, implicit $exec
+  ; SI:   S_BRANCH %bb.1
+  ; SI: bb.4.end:
+  ; SI:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2
+  ; SI:   SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI:   $vgpr0 = COPY killed [[PHI2]]
+  ; SI:   SI_RETURN_TO_EPILOG killed $vgpr0
+main_body:
+  %cc = icmp sgt i32 %z, 5
+  br i1 %cc, label %if, label %else
+
+if:
+  %v.if = fmul float %v, 2.0
+  br label %end
+
+else:
+  %v.else = fmul float %v, 3.0
+  br label %end
+
+end:
+  %r = phi float [ %v.if, %if ], [ %v.else, %else ]
+  ret float %r
+}
+
+
+; %v is used after the if-else
+define amdgpu_ps float @else2(i32 %z, float %v) #0 {
+  ; SI-LABEL: name: else2
+  ; SI: bb.0.main_body:
+  ; SI:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; SI:   liveins: $vgpr0, $vgpr1
+  ; SI:   [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1
+  ; SI:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
+  ; SI:   [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec
+  ; SI:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI:   S_BRANCH %bb.3
+  ; SI: bb.1.Flow:
+  ; SI:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; SI:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %15:vgpr_32, %bb.0, %4, %bb.3
+  ; SI:   [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI:   S_BRANCH %bb.2
+  ; SI: bb.2.if:
+  ; SI:   successors: %bb.4(0x80000000)
+  ; SI:   %3:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[COPY]], [[COPY]], implicit $mode, implicit $exec
+  ; SI:   S_BRANCH %bb.4
+  ; SI: bb.3.else:
+  ; SI:   successors: %bb.1(0x80000000)
+  ; SI:   %4:vgpr_32 = nofpexcept V_MUL_F32_e32 1077936128, [[COPY]], implicit $mode, implicit $exec
+  ; SI:   S_BRANCH %bb.1
+  ; SI: bb.4.end:
+  ; SI:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.1, %3, %bb.2
+  ; SI:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI]], %bb.1, %3, %bb.2
+  ; SI:   SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI:   %14:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI1]], killed [[PHI2]], implicit $mode, implicit $exec
+  ; SI:   $vgpr0 = COPY killed %14
+  ; SI:   SI_RETURN_TO_EPILOG killed $vgpr0
+main_body:
+  %cc = icmp sgt i32 %z, 5
+  br i1 %cc, label %if, label %else
+
+if:
+  %v.if = fmul float %v, 2.0
+  br label %end
+
+else:
+  %v.else = fmul float %v, 3.0
+  br label %end
+
+end:
+  %r0 = phi float [ %v.if, %if ], [ %v, %else ]
+  %r1 = phi float [ %v.if, %if ], [ %v.else, %else ]
+  %r2 = fadd float %r0, %r1
+  ret float %r2
+}
+
+; if-else inside loop, %x can be optimized, but %v cannot be.
+define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
+  ; SI-LABEL: name: else3
+  ; SI: bb.0.entry:
+  ; SI:   successors: %bb.1(0x80000000)
+  ; SI:   liveins: $vgpr0, $vgpr1, $sgpr0, $vgpr2
+  ; SI:   [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr2
+  ; SI:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY killed $sgpr0
+  ; SI:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1
+  ; SI:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
+  ; SI:   [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 6, killed [[COPY3]], implicit $exec
+  ; SI:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; SI: bb.1.for.body:
+  ; SI:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
+  ; SI:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %14, %bb.5
+  ; SI:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %13, %bb.5
+  ; SI:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_GT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI:   S_BRANCH %bb.4
+  ; SI: bb.2.Flow:
+  ; SI:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
+  ; SI:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.1, %10, %bb.4
+  ; SI:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %37:vgpr_32, %bb.1, %9, %bb.4
+  ; SI:   [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %40:vgpr_32, %bb.4
+  ; SI:   [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI:   S_BRANCH %bb.3
+  ; SI: bb.3.if:
+  ; SI:   successors: %bb.5(0x80000000)
+  ; SI:   %7:vgpr_32 = nofpexcept V_MUL_F32_e32 [[PHI]], [[COPY2]], implicit $mode, implicit $exec
+  ; SI:   %8:vgpr_32, dead %32:sreg_64 = V_ADD_CO_U32_e64 1, killed [[PHI4]], 0, implicit $exec
+  ; SI:   S_BRANCH %bb.5
+  ; SI: bb.4.else:
+  ; SI:   successors: %bb.2(0x80000000)
+  ; SI:   %9:vgpr_32 = nofpexcept V_MUL_F32_e32 [[COPY2]], [[PHI1]], implicit $mode, implicit $exec
+  ; SI:   [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
+  ; SI:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_MUL_LO_U32_e64_]]
+  ; SI:   S_BRANCH %bb.2
+  ; SI: bb.5.if.end:
+  ; SI:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+  ; SI:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI [[PHI3]], %bb.2, %7, %bb.3
+  ; SI:   [[PHI6:%[0-9]+]]:vgpr_32 = PHI [[PHI2]], %bb.2, %8, %bb.3
+  ; SI:   SI_END_CF killed [[SI_ELSE]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI:   %13:vgpr_32, dead %34:sreg_64 = V_ADD_CO_U32_e64 1, [[PHI6]], 0, implicit $exec
+  ; SI:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 killed [[PHI]], 1, implicit-def dead $scc
+  ; SI:   S_CMP_LT_I32 [[S_ADD_I32_]], [[COPY1]], implicit-def $scc
+  ; SI:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+  ; SI:   S_BRANCH %bb.6
+  ; SI: bb.6.for.end:
+  ; SI:   %35:vgpr_32 = nofpexcept V_ADD_F32_e32 killed [[PHI6]], killed [[PHI5]], implicit $mode, implicit $exec
+  ; SI:   $vgpr0 = COPY killed %35
+  ; SI:   SI_RETURN_TO_EPILOG killed $vgpr0
+entry:
+;  %break = icmp sgt i32 %bound, 0
+;  br i1 %break, label %for.body, label %for.end
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %if.end ]
+  %x = phi i32 [ %x0, %entry ], [ %xinc, %if.end ]
+  %cc = icmp sgt i32 %z, 5
+  br i1 %cc, label %if, label %else
+
+if:
+  %i.tmp = bitcast i32 %i to float
+  %v.if = fmul float %v, %i.tmp
+  %x.if = add i32 %x, 1
+  br label %if.end
+
+else:
+  %x.tmp = bitcast i32 %x to float
+  %v.else = fmul float %v, %x.tmp
+  %x.else = mul i32 %x, 3
+  br label %if.end
+
+if.end:
+  %v.endif = phi float [ %v.if, %if ], [ %v.else, %else ]
+  %x.endif = phi i32 [ %x.if, %if ], [ %x.else, %else ]
+
+  %xinc = add i32 %x.endif, 1
+  %inc = add i32 %i, 1
+  %cond = icmp slt i32 %inc, %bound
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %x_float = bitcast i32 %x.endif to float
+  %r = fadd float %x_float, %v.endif
+  ret float %r
+}
+
+attributes #0 = { nounwind }

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
new file mode 100644
index 0000000000000..0b4859eba68c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-opt-vgpr-liverange=true -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; a normal if-else
+define amdgpu_ps float @else1(i32 %z, float %v) #0 {
+; SI-LABEL: else1:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 6, v0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; SI-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; SI-NEXT:  ; %bb.1: ; %else
+; SI-NEXT:    v_mul_f32_e32 v0, 0x40400000, v1
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:  ; %bb.2: ; %Flow
+; SI-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; SI-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; SI-NEXT:  ; %bb.3: ; %if
+; SI-NEXT:    v_add_f32_e32 v0, v1, v1
+; SI-NEXT:  ; %bb.4: ; %end
+; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
+; SI-NEXT:    ; return to shader part epilog
+main_body:
+  %cc = icmp sgt i32 %z, 5
+  br i1 %cc, label %if, label %else
+
+if:
+  %v.if = fmul float %v, 2.0
+  br label %end
+
+else:
+  %v.else = fmul float %v, 3.0
+  br label %end
+
+end:
+  %r = phi float [ %v.if, %if ], [ %v.else, %else ]
+  ret float %r
+}
+
+
+; %v is used after the if-else
+define amdgpu_ps float @else2(i32 %z, float %v) #0 {
+; SI-LABEL: else2:
+; SI:       ; %bb.0: ; %main_body
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 6, v0
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; SI-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; SI-NEXT:  ; %bb.1: ; %else
+; SI-NEXT:    v_mul_f32_e32 v0, 0x40400000, v1
+; SI-NEXT:  ; %bb.2: ; %Flow
+; SI-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
+; SI-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; SI-NEXT:  ; %bb.3: ; %if
+; SI-NEXT:    v_add_f32_e32 v1, v1, v1
+; SI-NEXT:    v_mov_b32_e32 v0, v1
+; SI-NEXT:  ; %bb.4: ; %end
+; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_add_f32_e32 v0, v1, v0
+; SI-NEXT:    ; return to shader part epilog
+main_body:
+  %cc = icmp sgt i32 %z, 5
+  br i1 %cc, label %if, label %else
+
+if:
+  %v.if = fmul float %v, 2.0
+  br label %end
+
+else:
+  %v.else = fmul float %v, 3.0
+  br label %end
+
+end:
+  %r0 = phi float [ %v.if, %if ], [ %v, %else ]
+  %r1 = phi float [ %v.if, %if ], [ %v.else, %else ]
+  %r2 = fadd float %r0, %r1
+  ret float %r2
+}
+
+; if-else inside loop, %x can be optimized, but %v cannot be.
+define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
+; SI-LABEL: else3:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 6, v0
+; SI-NEXT:    s_mov_b32 s1, 0
+; SI-NEXT:    s_branch BB2_2
+; SI-NEXT:  BB2_1: ; %if.end
+; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_add_i32 s1, s1, 1
+; SI-NEXT:    s_cmp_lt_i32 s1, s0
+; SI-NEXT:    v_add_u32_e64 v2, s[2:3], 1, v0
+; SI-NEXT:    s_cbranch_scc0 BB2_6
+; SI-NEXT:  BB2_2: ; %for.body
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT:  ; %bb.3: ; %else
+; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; SI-NEXT:    v_mul_lo_u32 v0, v2, 3
+; SI-NEXT:    v_mul_f32_e32 v3, v1, v2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:  ; %bb.4: ; %Flow
+; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], s[2:3]
+; SI-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB2_1
+; SI-NEXT:  ; %bb.5: ; %if
+; SI-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; SI-NEXT:    v_mul_f32_e32 v3, s1, v1
+; SI-NEXT:    v_add_u32_e64 v0, s[2:3], 1, v2
+; SI-NEXT:    s_branch BB2_1
+; SI-NEXT:  BB2_6: ; %for.end
+; SI-NEXT:    v_add_f32_e32 v0, v0, v3
+; SI-NEXT:    ; return to shader part epilog
+entry:
+;  %break = icmp sgt i32 %bound, 0
+;  br i1 %break, label %for.body, label %for.end
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %if.end ]
+  %x = phi i32 [ %x0, %entry ], [ %xinc, %if.end ]
+  %cc = icmp sgt i32 %z, 5
+  br i1 %cc, label %if, label %else
+
+if:
+  %i.tmp = bitcast i32 %i to float
+  %v.if = fmul float %v, %i.tmp
+  %x.if = add i32 %x, 1
+  br label %if.end
+
+else:
+  %x.tmp = bitcast i32 %x to float
+  %v.else = fmul float %v, %x.tmp
+  %x.else = mul i32 %x, 3
+  br label %if.end
+
+if.end:
+  %v.endif = phi float [ %v.if, %if ], [ %v.else, %else ]
+  %x.endif = phi i32 [ %x.if, %if ], [ %x.else, %else ]
+
+  %xinc = add i32 %x.endif, 1
+  %inc = add i32 %i, 1
+  %cond = icmp slt i32 %inc, %bound
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %x_float = bitcast i32 %x.endif to float
+  %r = fadd float %x_float, %v.endif
+  ret float %r
+}
+
+attributes #0 = { nounwind }
