[llvm] a321d95 - [AMDGPU] avoid blind converting to VALU REG_SEQUENCE and PHIs

Alexander Timofeev via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 2 09:38:22 PDT 2022


Author: Alexander Timofeev
Date: 2022-08-02T18:37:57+02:00
New Revision: a321d95b597000a7b10418a9e7098344900b1111

URL: https://github.com/llvm/llvm-project/commit/a321d95b597000a7b10418a9e7098344900b1111
DIFF: https://github.com/llvm/llvm-project/commit/a321d95b597000a7b10418a9e7098344900b1111.diff

LOG: [AMDGPU] avoid blind converting to VALU REG_SEQUENCE and PHIs

In 2e29b0138ca243 we introduced a dedicated solving algorithm
that analyzes the use chains of VGPR to SGPR copies and either lowers
the copy to v_readfirstlane_b32 or converts the whole chain to VALU forms.
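
As a minimal illustrative sketch (hypothetical virtual registers, not taken
from this patch): when the chain is kept SALU, a VGPR to SGPR copy

    %s:sgpr_32 = COPY %v:vgpr_32

is lowered to

    %s:sgpr_32 = V_READFIRSTLANE_B32 %v:vgpr_32, implicit $exec

while otherwise the copy and its whole SALU use chain are rewritten to VALU
instructions via moveToVALU.
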
At the same time, we still have code that blindly converts REG_SEQUENCE and PHIs
to VALU when they produce an SGPR but have VGPR input operands. If such a
REG_SEQUENCE or PHI is in the use chain of a VGPR to SGPR copy, and that chain
was considered long enough to lower the copy to v_readfirstlane_b32, lowering
them to VALU anyway leads to several kinds of issues. First, we get a
v_readfirstlane_b32 that is completely useless because most of its use chain
has been moved to VALU form. Second, we may hit subtle bugs related to
EXEC-dependent control flow because of the odd mixing of SALU and VALU
instructions.
This change removes the code that moves REG_SEQUENCE and PHIs to VALU. Instead,
we exploit the fact that both REG_SEQUENCE and PHIs have copy semantics: if they
define an SGPR but have VGPR inputs, we insert VGPR to SGPR copies to make them
pure SGPR. The new copies are then processed by the common VGPR to SGPR lowering
algorithm, as sketched below.
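
As a minimal sketch with hypothetical registers, a PHI such as

    %r:sgpr_32 = PHI %v:vgpr_32, %bb.0, %s:sgpr_32, %bb.1

gets a copy inserted before the terminators of the incoming block %bb.0,

    %c:sgpr_32 = COPY %v:vgpr_32

and becomes

    %r:sgpr_32 = PHI %c:sgpr_32, %bb.0, %s:sgpr_32, %bb.1

The PHI is now pure SGPR, and the new COPY is analyzed and lowered like any
other VGPR to SGPR copy. The same applies to REG_SEQUENCE, with the copy
inserted right in front of it.
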
This is Part 2 in a series of commits aimed at a major refactoring of the SIFixSGPRCopies pass.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D130367

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
    llvm/test/CodeGen/AMDGPU/ds_read2.ll
    llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
    llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
    llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/udiv.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
    llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 3e95c55df57e..b77499e0fee9 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -82,9 +82,47 @@ static cl::opt<bool> EnableM0Merge(
   cl::init(true));
 
 namespace {
+
+class V2SCopyInfo {
+public:
+  // VGPR to SGPR copy being processed
+  MachineInstr *Copy;
+  // All SALU instructions reachable from this copy in SSA graph
+  DenseSet<MachineInstr *> SChain;
+  // Number of SGPR to VGPR copies that are used to put the SALU computation
+  // results back to VALU.
+  unsigned NumSVCopies;
+
+  unsigned Score;
+  // Actual count of v_readfirstlane_b32
+  // which need to be inserted to keep SChain SALU
+  unsigned NumReadfirstlanes;
+  // Current score state. To speedup selection V2SCopyInfos for processing
+  bool NeedToBeConvertedToVALU = false;
+  // Unique ID. Used as a key for mapping to keep permanent order.
+  unsigned ID;
+
+  // Count of another VGPR to SGPR copies that contribute to the
+  // current copy SChain
+  unsigned SiblingPenalty = 0;
+  SetVector<unsigned> Siblings;
+  V2SCopyInfo() : Copy(nullptr), ID(0){};
+  V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
+      : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump() {
+    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
+           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
+           << "\nScore: " << Score << "\n";
+  }
+#endif
+};
+
 class SIFixSGPRCopies : public MachineFunctionPass {
   MachineDominatorTree *MDT;
   unsigned NextVGPRToSGPRCopyID;
+  DenseMap<unsigned, V2SCopyInfo> V2SCopies;
+  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
 
 public:
   static char ID;
@@ -97,6 +135,7 @@ class SIFixSGPRCopies : public MachineFunctionPass {
 
   bool runOnMachineFunction(MachineFunction &MF) override;
   unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
+  void analyzeVGPRToSGPRCopy(V2SCopyInfo& Info);
   void lowerVGPR2SGPRCopies(MachineFunction &MF);
   // Handles copies which source register is:
   // 1. Physical register
@@ -104,7 +143,7 @@ class SIFixSGPRCopies : public MachineFunctionPass {
   // 3. Defined by the instruction the merely moves the immediate
   bool lowerSpecialCase(MachineInstr &MI);
 
-  MachineBasicBlock *processPHINode(MachineInstr &MI);
+  void processPHINode(MachineInstr &MI);
 
   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
 
@@ -663,15 +702,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         break;
       }
       case AMDGPU::PHI: {
-        MachineBasicBlock *NewBB = processPHINode(MI);
-        if (NewBB && NewBB != MBB) {
-          MBB = NewBB;
-          E = MBB->end();
-          BI = MachineFunction::iterator(MBB);
-          BE = MF.end();
-        }
-        assert((!NewBB || NewBB == I->getParent()) &&
-               "moveToVALU did not return the right basic block");
+        processPHINode(MI);
         break;
       }
       case AMDGPU::REG_SEQUENCE: {
@@ -681,17 +712,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
           continue;
         }
 
-        LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
-
-        MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
-        if (NewBB && NewBB != MBB) {
-          MBB = NewBB;
-          E = MBB->end();
-          BI = MachineFunction::iterator(MBB);
-          BE = MF.end();
-        }
-        assert((!NewBB || NewBB == I->getParent()) &&
-               "moveToVALU did not return the right basic block");
         break;
       }
       case AMDGPU::INSERT_SUBREG: {
@@ -783,58 +803,42 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
     hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
 
+  SiblingPenalty.clear();
+  V2SCopies.clear();
+
   return true;
 }
 
-MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
-  unsigned numVGPRUses = 0;
+void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
   bool AllAGPRUses = true;
   SetVector<const MachineInstr *> worklist;
   SmallSet<const MachineInstr *, 4> Visited;
   SetVector<MachineInstr *> PHIOperands;
-  MachineBasicBlock *CreatedBB = nullptr;
   worklist.insert(&MI);
   Visited.insert(&MI);
+  // HACK to make MIR tests with no uses happy
+  bool HasUses = false;
   while (!worklist.empty()) {
     const MachineInstr *Instr = worklist.pop_back_val();
     Register Reg = Instr->getOperand(0).getReg();
     for (const auto &Use : MRI->use_operands(Reg)) {
+      HasUses = true;
       const MachineInstr *UseMI = Use.getParent();
       AllAGPRUses &= (UseMI->isCopy() &&
                       TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                      TRI->isAGPR(*MRI, Use.getReg());
       if (UseMI->isCopy() || UseMI->isRegSequence()) {
-        if (UseMI->isCopy() &&
-          UseMI->getOperand(0).getReg().isPhysical() &&
-          !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
-          numVGPRUses++;
-        }
         if (Visited.insert(UseMI).second)
           worklist.insert(UseMI);
 
         continue;
       }
-
-      if (UseMI->isPHI()) {
-        const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
-        if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
-          UseRC != &AMDGPU::VReg_1RegClass)
-          numVGPRUses++;
-        continue;
-      }
-
-      const TargetRegisterClass *OpRC =
-        TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
-      if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
-        OpRC != &AMDGPU::VS_64RegClass) {
-        numVGPRUses++;
-      }
     }
   }
 
   Register PHIRes = MI.getOperand(0).getReg();
   const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
-  if (AllAGPRUses && numVGPRUses && !TRI->isAGPRClass(RC0)) {
+  if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC0)) {
     LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
     MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
     for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
@@ -844,49 +848,8 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
     }
   }
 
-  bool hasVGPRInput = false;
-  for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
-    Register InputReg = MI.getOperand(i).getReg();
-    MachineInstr *Def = MRI->getVRegDef(InputReg);
-    if (TRI->isVectorRegister(*MRI, InputReg)) {
-      if (Def->isCopy()) {
-        Register SrcReg = Def->getOperand(1).getReg();
-        const TargetRegisterClass *RC =
-          TRI->getRegClassForReg(*MRI, SrcReg);
-        if (TRI->isSGPRClass(RC))
-          continue;
-      }
-      hasVGPRInput = true;
-      break;
-    }
-    else if (Def->isCopy() &&
-      TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
-      Register SrcReg = Def->getOperand(1).getReg();
-      MachineInstr *SrcDef = MRI->getVRegDef(SrcReg);
-      unsigned SMovOp;
-      int64_t Imm;
-      if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) {
-        hasVGPRInput = true;
-        break;
-      } else {
-        // Formally, if we did not do this right away
-        // it would be done on the next iteration of the
-        // runOnMachineFunction main loop. But why not if we can?
-        MachineFunction *MF = MI.getParent()->getParent();
-        Def->getOperand(1).ChangeToImmediate(Imm);
-        Def->addImplicitDefUseOperands(*MF);
-        Def->setDesc(TII->get(SMovOp));
-      }
-    }
-  }
-
-  if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
-       RC0 != &AMDGPU::VReg_1RegClass) &&
-    (hasVGPRInput || numVGPRUses > 1)) {
-    LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
-    CreatedBB = TII->moveToVALU(MI);
-  }
-  else {
+  if (TRI->isVectorRegister(*MRI, PHIRes) ||
+       RC0 == &AMDGPU::VReg_1RegClass) {
     LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
     TII->legalizeOperands(MI, MDT);
   }
@@ -895,18 +858,9 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
   while (!PHIOperands.empty()) {
     processPHINode(*PHIOperands.pop_back_val());
   }
-  return CreatedBB;
 }
 
 bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
-  MachineBasicBlock *MBB = MI.getParent();
-  const TargetRegisterClass *SrcRC, *DstRC;
-  std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
-
-  // We return true to indicate that no further processing needed
-  if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
-    return true;
-
   Register SrcReg = MI.getOperand(1).getReg();
   if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
     TII->moveToVALU(MI, MDT);
@@ -919,73 +873,89 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
   // s_mov_b32.
   if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
     MI.getOperand(1).ChangeToImmediate(Imm);
-    MI.addImplicitDefUseOperands(*MBB->getParent());
+    MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
     MI.setDesc(TII->get(SMovOp));
     return true;
   }
   return false;
 }
 
-class V2SCopyInfo {
-public:
-  // VGPR to SGPR copy being processed
-  MachineInstr *Copy;
-  // All SALU instructions reachable from this copy in SSA graph
-  DenseSet<MachineInstr *> SChain;
-  // Number of SGPR to VGPR copies that are used to put the SALU computation
-  // results back to VALU.
-  unsigned NumSVCopies;
 
-  unsigned Score;
-  // Actual count of v_readfirstlane_b32
-  // which need to be inserted to keep SChain SALU
-  unsigned NumReadfirstlanes;
-  // Current score state. To speedup selection V2SCopyInfos for processing
-  bool NeedToBeConvertedToVALU = false;
-  // Unique ID. Used as a key for mapping to keep permanent order.
-  unsigned ID;
 
-  // Count of another VGPR to SGPR copies that contribute to the
-  // current copy SChain
-  unsigned SiblingPenalty = 0;
-  SetVector<unsigned> Siblings;
-  V2SCopyInfo() : Copy(nullptr), ID(0){};
-  V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
-      : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() {
-    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
-           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
-           << "\nScore: " << Score << "\n";
+void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(V2SCopyInfo& Info) {
+  SmallVector<MachineInstr *, 8> AnalysisWorklist;
+  // Needed because the SSA is not a tree but a graph and may have
+  // forks and joins. We should not then go same way twice.
+  DenseSet<MachineInstr *> Visited;
+  AnalysisWorklist.push_back(Info.Copy);
+  while (!AnalysisWorklist.empty()) {
+
+    MachineInstr *Inst = AnalysisWorklist.pop_back_val();
+
+    if (!Visited.insert(Inst).second)
+      continue;
+
+    // Copies and REG_SEQUENCE do not contribute to the final assembly
+    // So, skip them but take care of the SGPR to VGPR copies bookkeeping.
+    if (Inst->isCopy() || Inst->isRegSequence()) {
+      if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
+        if (!Inst->isCopy() ||
+            !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
+          Info.NumSVCopies++;
+          continue;
+        }
+      }
+    }
+
+    SiblingPenalty[Inst].insert(Info.ID);
+
+    SmallVector<MachineInstr *, 4> Users;
+    if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
+        (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
+      auto I = Inst->getIterator();
+      auto E = Inst->getParent()->end();
+      while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
+        if (I->readsRegister(AMDGPU::SCC))
+          Users.push_back(&*I);
+      }
+    } else if (Inst->getNumExplicitDefs() != 0) {
+      Register Reg = Inst->getOperand(0).getReg();
+      if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst))
+        for (auto &U : MRI->use_instructions(Reg))
+          Users.push_back(&U);
+    }
+    for (auto U : Users) {
+      if (TII->isSALU(*U))
+        Info.SChain.insert(U);
+      AnalysisWorklist.push_back(U);
+    }
   }
-#endif
-};
+}
 
 void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
 
-  DenseMap<unsigned, V2SCopyInfo> Copies;
-  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
-
   // The main function that computes the VGPR to SGPR copy score
   // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU
   auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
-    if (I->SChain.empty())
+    if (I->SChain.empty()) {
+      I->Score = 0;
       return true;
+    }
     I->Siblings = SiblingPenalty[*std::max_element(
         I->SChain.begin(), I->SChain.end(),
         [&](MachineInstr *A, MachineInstr *B) -> bool {
           return SiblingPenalty[A].size() < SiblingPenalty[B].size();
         })];
     I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
-    // The loop below computes the number of another VGPR to SGPR copies
+    // The loop below computes the number of another VGPR to SGPR V2SCopies
     // which contribute to the current copy SALU chain. We assume that all the
-    // copies with the same source virtual register will be squashed to one by
-    // regalloc. Also we take careof the copies of the differnt subregs of the
-    // same register.
+    // V2SCopies with the same source virtual register will be squashed to one
+    // by regalloc. Also we take care of the V2SCopies of the differnt subregs
+    // of the same register.
     SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
     for (auto J : I->Siblings) {
-      auto InfoIt = Copies.find(J);
-      if (InfoIt != Copies.end()) {
+      auto InfoIt = V2SCopies.find(J);
+      if (InfoIt != V2SCopies.end()) {
         MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
         if (SiblingCopy->isImplicitDef())
           // the COPY has already been MoveToVALUed
@@ -1012,97 +982,106 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
     case AMDGPU::STRICT_WQM:
     case AMDGPU::SOFT_WQM:
     case AMDGPU::STRICT_WWM:
+    case AMDGPU::REG_SEQUENCE:
+    case AMDGPU::PHI:
       return true;
     default:
       return false;
     }
   };
 
+  SmallSet<MachineInstr *, 4> OutOfOrderProcessedCopies;
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
        ++BI) {
     MachineBasicBlock *MBB = &*BI;
     for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
          ++I) {
-      MachineInstr &MI = *I;
-      if (!needProcessing(MI))
-        continue;
-      if (lowerSpecialCase(MI))
+      MachineInstr *MI = &*I;
+      if (!needProcessing(*MI))
         continue;
 
-      // Compute the COPY width to pass it to V2SCopyInfo Ctor
-      Register DstReg = MI.getOperand(0).getReg();
+      if (MI->isRegSequence() || MI->isPHI()) {
+        MachineBasicBlock::iterator J = I;
+        if (TRI->isSGPRClass(TII->getOpRegClass(*MI, 0))) {
+          for (MachineOperand &MO : MI->operands()) {
+            if (!MO.isReg() || !MO.getReg().isVirtual())
+              continue;
+            const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
+            if (TRI->hasVectorRegisters(SrcRC)) {
+              const TargetRegisterClass *DestRC =
+                  TRI->getEquivalentSGPRClass(SrcRC);
+              Register NewDst = MRI->createVirtualRegister(DestRC);
+              MachineBasicBlock *BlockToInsertCopy = MBB;
+              MachineBasicBlock::iterator PointToInsertCopy = I;
+              if (MI->isPHI()) {
+                BlockToInsertCopy =
+                    MI->getOperand(MI->getOperandNo(&MO) + 1).getMBB();
+                PointToInsertCopy =
+                    BlockToInsertCopy->getFirstInstrTerminator();
+              }
+              MachineBasicBlock::iterator NewI =
+                  BuildMI(*BlockToInsertCopy, PointToInsertCopy,
+                          PointToInsertCopy->getDebugLoc(),
+                          TII->get(AMDGPU::COPY), NewDst)
+                      .addReg(MO.getReg());
+              MO.setReg(NewDst);
+              if (!MI->isPHI()) {
+                I = NewI;
+                MI = &*I;
+              } else {
+                // We insert the copy into the basic block that may have been
+                // already processed. Pass it to the analysis explicitly.
+                V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI,
+                               TRI->getRegSizeInBits(*DestRC));
+                analyzeVGPRToSGPRCopy(In);
+                V2SCopies[In.ID] = In;
+                OutOfOrderProcessedCopies.insert(MI);
+              }
+            }
+          }
+        }
 
-      const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);
+        if (J == I)
+          continue;
+      }
 
-      V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI,
-                     TRI->getRegSizeInBits(*DstRC));
+      const TargetRegisterClass *SrcRC, *DstRC;
+      std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
 
-      SmallVector<MachineInstr *, 8> AnalysisWorklist;
-      // Needed because the SSA is not a tree but a graph and may have
-      // forks and joins. We should not then go same way twice.
-      DenseSet<MachineInstr *> Visited;
-      AnalysisWorklist.push_back(&MI);
-      while (!AnalysisWorklist.empty()) {
+      if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
+        continue;
 
-        MachineInstr *Inst = AnalysisWorklist.pop_back_val();
+      if (lowerSpecialCase(*MI))
+        continue;
 
-        if (!Visited.insert(Inst).second)
-          continue;
+      if (OutOfOrderProcessedCopies.contains(MI))
+        continue;
 
-        // Copies and REG_SEQUENCE do not contribute to the final assembly
-        // So, skip them but take care of the SGPR to VGPR copies bookkeeping.
-        if (Inst->isCopy() || Inst->isRegSequence()) {
-          if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
-            if (!Inst->isCopy() ||
-                !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
-              In.NumSVCopies++;
-              continue;
-            }
-          }
-        }
+      V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI,
+                     TRI->getRegSizeInBits(*DstRC));
 
-        SiblingPenalty[Inst].insert(In.ID);
+      analyzeVGPRToSGPRCopy(In);
 
-        SmallVector<MachineInstr *, 4> Users;
-        if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
-            (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
-          auto I = Inst->getIterator();
-          auto E = Inst->getParent()->end();
-          while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
-            if (I->readsRegister(AMDGPU::SCC))
-              Users.push_back(&*I);
-          }
-        } else if (Inst->getNumExplicitDefs() != 0) {
-          Register Reg = Inst->getOperand(0).getReg();
-          if (TRI->isSGPRReg(*MRI, Reg))
-            for (auto &U : MRI->use_instructions(Reg))
-              Users.push_back(&U);
-        }
-        for (auto U : Users) {
-          if (TII->isSALU(*U))
-            In.SChain.insert(U);
-          AnalysisWorklist.push_back(U);
-        }
-      }
-      Copies[In.ID] = In;
+      V2SCopies[In.ID] = In;
     }
   }
 
   SmallVector<unsigned, 8> LoweringWorklist;
-  for (auto &C : Copies) {
+  for (auto &C : V2SCopies) {
     if (needToBeConvertedToVALU(&C.second))
       LoweringWorklist.push_back(C.second.ID);
   }
 
   while (!LoweringWorklist.empty()) {
     unsigned CurID = LoweringWorklist.pop_back_val();
-    auto CurInfoIt = Copies.find(CurID);
-    if (CurInfoIt != Copies.end()) {
+    auto CurInfoIt = V2SCopies.find(CurID);
+    if (CurInfoIt != V2SCopies.end()) {
       V2SCopyInfo C = CurInfoIt->getSecond();
       LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
       for (auto S : C.Siblings) {
-        auto SibInfoIt = Copies.find(S);
-        if (SibInfoIt != Copies.end()) {
+        auto SibInfoIt = V2SCopies.find(S);
+        if (SibInfoIt != V2SCopies.end()) {
           V2SCopyInfo &SI = SibInfoIt->getSecond();
           LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
           if (!SI.NeedToBeConvertedToVALU) {
@@ -1115,13 +1094,13 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
       }
       LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
                         << " is being turned to VALU\n");
-      Copies.erase(C.ID);
+      V2SCopies.erase(C.ID);
       TII->moveToVALU(*C.Copy, MDT);
     }
   }
 
   // Now do actual lowering
-  for (auto C : Copies) {
+  for (auto C : V2SCopies) {
     MachineInstr *MI = C.second.Copy;
     MachineBasicBlock *MBB = MI->getParent();
     // We decide to turn V2S copy to v_readfirstlane_b32

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index ad3b0d08ca0d..bce6ba45c312 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -1498,7 +1498,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
@@ -1520,7 +1520,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
 ; GFX6-NEXT:    s_sub_i32 s4, 0, s11
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
@@ -1531,7 +1531,7 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
@@ -1837,7 +1837,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    s_sub_i32 s0, 0, s9
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
@@ -1887,7 +1887,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    s_add_i32 s1, s6, s0
 ; GFX6-NEXT:    s_xor_b32 s5, s5, s2
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s5
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
 ; GFX6-NEXT:    s_xor_b32 s3, s0, s3
@@ -1909,7 +1909,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
 ; GFX6-NEXT:    s_xor_b32 s2, s0, s2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
@@ -3005,7 +3005,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s6
@@ -3021,7 +3021,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    s_sext_i32_i16 s6, s5
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v1
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_xor_b32 s4, s6, s4
@@ -3050,7 +3050,7 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -3273,7 +3273,7 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_ashr_i32 s9, s6, 16
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GFX6-NEXT:    s_lshr_b32 s8, s4, 16
@@ -3312,7 +3312,7 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
 ; GFX6-NEXT:    s_lshr_b32 s6, s7, 16
 ; GFX6-NEXT:    s_ashr_i32 s7, s5, 16
@@ -3328,7 +3328,7 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s6
 ; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
@@ -3626,7 +3626,7 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -3710,7 +3710,7 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_lshr_b32 s3, s4, 8
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
@@ -4196,7 +4196,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s6
@@ -4212,7 +4212,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    s_sext_i32_i16 s5, s5
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
@@ -4225,7 +4225,7 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -4403,7 +4403,7 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_ashr_i32 s9, s6, 16
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GFX6-NEXT:    s_lshr_b32 s8, s4, 16
@@ -4439,7 +4439,7 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s8, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -4826,7 +4826,7 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    s_mov_b32 s0, s4
 ; GFX6-NEXT:    s_lshr_b32 s4, s6, 15
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v6
@@ -5029,7 +5029,7 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -5042,7 +5042,7 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
@@ -5234,7 +5234,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v4, s8
 ; GFX6-NEXT:    s_bfe_i32 s5, s8, 0xf000f
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s5
@@ -5275,7 +5275,7 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    s_lshr_b32 s7, s8, 15
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s7
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v5
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
@@ -5550,7 +5550,7 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)*
 ; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -5669,9 +5669,9 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
@@ -6000,9 +6000,9 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
@@ -6187,7 +6187,7 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
@@ -7079,7 +7079,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -7117,9 +7117,9 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s9
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
 ; GFX6-NEXT:    v_mov_b32_e32 v5, 0x11f
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s9
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -7467,7 +7467,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s6
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -7752,7 +7752,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -7787,8 +7787,8 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s12
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s12
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0x11f
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
@@ -8198,9 +8198,9 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
 ; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
 ; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
@@ -8852,7 +8852,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    s_mov_b32 s11, s10
 ; GFX6-NEXT:    s_addc_u32 s1, s3, s10
 ; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[10:11]
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -9160,9 +9160,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s13
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -9280,9 +9280,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s3, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s7, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s6, v5
@@ -9982,7 +9982,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -10354,7 +10354,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -10468,7 +10468,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4

diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index 535173bb0f9e..5077ddf894c3 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -13,17 +13,17 @@ define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_addk_i32 s0, 0x80
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    s_and_b64 s[0:1], exec, -1
+; GCN-NEXT:    s_and_b64 vcc, exec, -1
 ; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:  .LBB0_2: ; %for.body
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    ds_write_b32 v0, v1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
-; GCN-NEXT:    s_mov_b64 vcc, s[0:1]
+; GCN-NEXT:    s_add_i32 s0, s0, 4
+; GCN-NEXT:    s_mov_b64 vcc, vcc
 ; GCN-NEXT:    s_cbranch_vccnz .LBB0_2
 ; GCN-NEXT:  .LBB0_3: ; %for.exit
 ; GCN-NEXT:    s_endpgm
@@ -92,15 +92,15 @@ define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nou
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_addk_i32 s0, 0x80
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:  .LBB1_1: ; %for.body
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    ds_write_b32 v0, v1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; GCN-NEXT:    s_add_i32 s0, s0, 4
 ; GCN-NEXT:    s_branch .LBB1_1
 ;
 ; GCN_DBG-LABEL: loop_const_true:
@@ -291,23 +291,23 @@ define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    ds_read_u8 v0, v0
-; GCN-NEXT:    s_load_dword s2, s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
 ; GCN-NEXT:    s_bitcmp1_b32 s0, 0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_addk_i32 s2, 0x80
-; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GCN-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GCN-NEXT:    s_add_i32 s0, s4, 0x80
+; GCN-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN-NEXT:  .LBB4_1: ; %for.body
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    ds_write_b32 v0, v1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
-; GCN-NEXT:    s_mov_b64 vcc, s[0:1]
+; GCN-NEXT:    s_add_i32 s0, s0, 4
+; GCN-NEXT:    s_mov_b64 vcc, vcc
 ; GCN-NEXT:    s_cbranch_vccz .LBB4_1
 ; GCN-NEXT:  ; %bb.2: ; %for.exit
 ; GCN-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 0297ddbcb04a..4a7a328d33c6 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -975,7 +975,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
index 02966a2a5942..84b315b80031 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,ADDR64
 # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -verify-machine-dom-info --run-pass=si-fix-sgpr-copies -o - %s | FileCheck %s --check-prefixes=W64,W64-NO-ADDR64
@@ -12,58 +13,7 @@
 # on non-ADDR64 hardware, we emit a waterfall loop.
 
 
-# W64-LABEL: name: idxen
-# W64-LABEL:  bb.0:
-# W64-NEXT: successors: %bb.1({{.*}})
-# W64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# W64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
-# W64-LABEL: bb.1:
-# W64-NEXT: successors: %bb.2({{.*}})
-# W64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec
-# W64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec
-# W64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1
-# W64: [[CMP0:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec
-# W64: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec
-# W64: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec
-# W64: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1
-# W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec
-# W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
-# W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
-# W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64-LABEL: bb.2:
-# W64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}})
-# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
-# W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
-# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec
-# W64-LABEL:  bb.3:
-# W64: $exec = S_MOV_B64 [[SAVEEXEC]]
-
-# W32-LABEL: name: idxen
-# W32-LABEL:  bb.0:
-# W32-NEXT: successors: %bb.1({{.*}})
-# W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
-# W32-LABEL: bb.1:
-# W32-NEXT: successors: %bb.2({{.*}})
-# W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec
-# W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec
-# W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1
-# W32: [[CMP0:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec
-# W32: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec
-# W32: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec
-# W32: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1
-# W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec
-# W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
-# W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
-# W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32-LABEL: bb.2:
-# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}})
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
-# W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
-# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec
-# W32-LABEL:  bb.3:
-# W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]]
 ---
 name:            idxen
 liveins:
@@ -76,6 +26,100 @@ liveins:
 body:             |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W64-LABEL: name: idxen
+    ; W64: successors: %bb.1(0x80000000)
+    ; W64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; W64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+    ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+    ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+    ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: .1:
+    ; W64-NEXT: successors: %bb.2(0x80000000)
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+    ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+    ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec
+    ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+    ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
+    ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
+    ; W64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+    ; W64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: .2:
+    ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec
+    ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+    ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: .3:
+    ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]]
+    ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]]
+    ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; W32-LABEL: name: idxen
+    ; W32: successors: %bb.1(0x80000000)
+    ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+    ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .1:
+    ; W32-NEXT: successors: %bb.2(0x80000000)
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+    ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+    ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
+    ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
+    ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+    ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .2:
+    ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec
+    ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+    ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .3:
+    ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+    ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]]
+    ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
     %5:sreg_64 = COPY $sgpr30_sgpr31
     %4:vgpr_32 = COPY $vgpr4
     %3:vgpr_32 = COPY $vgpr3
@@ -89,58 +133,7 @@ body:             |
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
 ...
 
-# W64-LABEL: name: offen
-# W64-LABEL:  bb.0:
-# W64-NEXT: successors: %bb.1({{.*}})
-# W64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# W64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
-# W64-LABEL: bb.1:
-# W64-NEXT: successors: %bb.2({{.*}})
-# W64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec
-# W64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec
-# W64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1
-# W64: [[CMP0:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec
-# W64: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec
-# W64: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec
-# W64: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1
-# W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec
-# W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
-# W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
-# W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64-LABEL: bb.2:
-# W64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}})
-# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
-# W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
-# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec
-# W64-LABEL:  bb.3:
-# W64: $exec = S_MOV_B64 [[SAVEEXEC]]
 
-# W32-LABEL: name: offen
-# W32-LABEL:  bb.0:
-# W32-NEXT: successors: %bb.1({{.*}})
-# W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
-# W32-LABEL: bb.1:
-# W32-NEXT: successors: %bb.2({{.*}})
-# W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec
-# W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec
-# W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1
-# W32: [[CMP0:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec
-# W32: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec
-# W32: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec
-# W32: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1
-# W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec
-# W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
-# W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
-# W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32-LABEL: bb.2:
-# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}})
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
-# TODO: S_XOR_B32_term should be `implicit-def $scc`
-# W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
-# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec
-# W32-LABEL:  bb.3:
-# W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]]
 ---
 name:            offen
 liveins:
@@ -153,6 +146,100 @@ liveins:
 body:             |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W64-LABEL: name: offen
+    ; W64: successors: %bb.1(0x80000000)
+    ; W64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; W64-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+    ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
+    ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
+    ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: .1:
+    ; W64-NEXT: successors: %bb.2(0x80000000)
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+    ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+    ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec
+    ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+    ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
+    ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
+    ; W64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+    ; W64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: .2:
+    ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec
+    ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+    ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: .3:
+    ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]]
+    ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]]
+    ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; W32-LABEL: name: offen
+    ; W32: successors: %bb.1(0x80000000)
+    ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; W32-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
+    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
+    ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .1:
+    ; W32-NEXT: successors: %bb.2(0x80000000)
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+    ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+    ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
+    ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
+    ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+    ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .2:
+    ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec
+    ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+    ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .3:
+    ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+    ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]]
+    ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
     %5:sreg_64 = COPY $sgpr30_sgpr31
     %4:vgpr_32 = COPY $vgpr4
     %3:vgpr_32 = COPY $vgpr3
@@ -166,58 +253,7 @@ body:             |
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
 ...
 
-# W64-LABEL: name: bothen
-# W64-LABEL:  bb.0:
-# W64-NEXT: successors: %bb.1({{.*}})
-# W64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# W64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
-# W64-LABEL: bb.1:
-# W64-NEXT: successors: %bb.2({{.*}})
-# W64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec
-# W64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec
-# W64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1
-# W64: [[CMP0:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec
-# W64: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec
-# W64: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec
-# W64: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1
-# W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec
-# W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
-# W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
-# W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64-LABEL: bb.2:
-# W64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}})
-# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
-# W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
-# W64: SI_WATERFALL_LOOP %bb.1, implicit $exec
-# W64-LABEL:  bb.3:
-# W64: $exec = S_MOV_B64 [[SAVEEXEC]]
 
-# W32-LABEL: name: bothen
-# W32-LABEL:  bb.0:
-# W32-NEXT: successors: %bb.1({{.*}})
-# W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
-# W32-LABEL: bb.1:
-# W32-NEXT: successors: %bb.2({{.*}})
-# W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec
-# W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec
-# W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1
-# W32: [[CMP0:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec
-# W32: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec
-# W32: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec
-# W32: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1
-# W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec
-# W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
-# W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
-# W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32-LABEL: bb.2:
-# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}})
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
-# TODO: S_XOR_B32_term should be `implicit-def $scc`
-# W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
-# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec
-# W32-LABEL:  bb.3:
-# W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]]
 ---
 name:            bothen
 liveins:
@@ -230,6 +266,100 @@ liveins:
 body:             |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W64-LABEL: name: bothen
+    ; W64: successors: %bb.1(0x80000000)
+    ; W64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; W64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; W64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; W64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; W64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; W64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; W64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
+    ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+    ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+    ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: .1:
+    ; W64-NEXT: successors: %bb.2(0x80000000)
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+    ; W64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+    ; W64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec
+    ; W64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec
+    ; W64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+    ; W64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
+    ; W64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
+    ; W64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+    ; W64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: .2:
+    ; W64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec
+    ; W64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+    ; W64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; W64-NEXT: {{  $}}
+    ; W64-NEXT: .3:
+    ; W64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]]
+    ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]]
+    ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; W32-LABEL: name: bothen
+    ; W32: successors: %bb.1(0x80000000)
+    ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; W32-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
+    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+    ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .1:
+    ; W32-NEXT: successors: %bb.2(0x80000000)
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+    ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+    ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
+    ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
+    ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+    ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .2:
+    ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[COPY1]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec
+    ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+    ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .3:
+    ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+    ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]]
+    ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
     %5:sreg_64 = COPY $sgpr30_sgpr31
     %4:vreg_64 = COPY $vgpr4_vgpr5
     %3:vgpr_32 = COPY $vgpr3
@@ -243,17 +373,7 @@ body:             |
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
 ...
 
-# ADDR64-LABEL: name: addr64
-# ADDR64-LABEL:  bb.0:
-# ADDR64: %14:vreg_64 = COPY %8.sub0_sub1
-# ADDR64: %15:sreg_64 = S_MOV_B64 0
-# ADDR64: %16:sgpr_32 = S_MOV_B32 0
-# ADDR64: %17:sgpr_32 = S_MOV_B32 61440
-# ADDR64: %18:sgpr_128 = REG_SEQUENCE %15, %subreg.sub0_sub1, %16, %subreg.sub2, %17, %subreg.sub3
-# ADDR64: %9:vgpr_32, %12:sreg_64_xexec = V_ADD_CO_U32_e64 %14.sub0, %4.sub0, 0, implicit $exec
-# ADDR64: %10:vgpr_32, dead %13:sreg_64_xexec = V_ADDC_U32_e64 %14.sub1, %4.sub1, killed %12, 0, implicit $exec
-# ADDR64: %11:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %10, %subreg.sub1
-# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, implicit $exec
+
 ---
 name:            addr64
 liveins:
@@ -266,6 +386,64 @@ liveins:
 body:             |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; ADDR64-LABEL: name: addr64
+    ; ADDR64: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; ADDR64-NEXT: {{  $}}
+    ; ADDR64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; ADDR64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+    ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+    ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+    ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+    ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+    ; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440
+    ; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3
+    ; ADDR64-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec
+    ; ADDR64-NEXT: %17:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1
+    ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
+    ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]]
+    ; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; W32-LABEL: name: addr64
+    ; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; W32-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+    ; W32-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+    ; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+    ; W32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 822173696
+    ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3
+    ; W32-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec
+    ; W32-NEXT: %17:vgpr_32, dead %20:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1
+    ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
+    ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]]
+    ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
     %5:sreg_64 = COPY $sgpr30_sgpr31
     %4:vreg_64 = COPY $vgpr4_vgpr5
     %3:vgpr_32 = COPY $vgpr3
@@ -279,66 +457,7 @@ body:             |
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
 ...
 
-# W64-LABEL: name: offset
-# W64-LABEL:  bb.0:
-
-# W64-NO-ADDR64: successors: %bb.1({{.*}})
-# W64-NO-ADDR64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# W64-NO-ADDR64: [[SAVEEXEC:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
-# W64-NO-ADDR64-LABEL: bb.1:
-# W64-NO-ADDR64-NEXT: successors: %bb.2({{.*}})
-# W64-NO-ADDR64: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec
-# W64-NO-ADDR64: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec
-# W64-NO-ADDR64: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1
-# W64-NO-ADDR64: [[CMP0:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec
-# W64-NO-ADDR64: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec
-# W64-NO-ADDR64: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec
-# W64-NO-ADDR64: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1
-# W64-NO-ADDR64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec
-# W64-NO-ADDR64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
-# W64-NO-ADDR64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
-# W64-NO-ADDR64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64-NO-ADDR64-LABEL: bb.2:
-# W64-NO-ADDR64-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}})
-# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
-# W64-NO-ADDR64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
-# W64-NO-ADDR64: SI_WATERFALL_LOOP %bb.1, implicit $exec
-# W64-NO-ADDR64-LABEL:  bb.3:
-# W64-NO-ADDR64: $exec = S_MOV_B64 [[SAVEEXEC]]
-
-# W32: successors: %bb.1({{.*}})
-# W32: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# W32: [[SAVEEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
-# W32-LABEL: bb.1:
-# W32-NEXT: successors: %bb.2({{.*}})
-# W32: [[SRSRC0:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub0, implicit $exec
-# W32: [[SRSRC1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub1, implicit $exec
-# W32: [[STMP0:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1
-# W32: [[CMP0:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP0]], [[VRSRC]].sub0_sub1, implicit $exec
-# W32: [[SRSRC2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub2, implicit $exec
-# W32: [[SRSRC3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[VRSRC]].sub3, implicit $exec
-# W32: [[STMP1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[SRSRC2]], %subreg.sub0, [[SRSRC3]], %subreg.sub1
-# W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[STMP1]], [[VRSRC]].sub2_sub3, implicit $exec
-# W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
-# W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3
-# W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32-LABEL: bb.2:
-# W32-NEXT: successors: %bb.1({{.*}}), %bb.3({{.*}})
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, implicit $exec
-# TODO: S_XOR_B32_term should be `implicit-def $scc`
-# W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
-# W32: SI_WATERFALL_LOOP %bb.1, implicit $exec
-# W32-LABEL:  bb.3:
-# W32: $exec_lo = S_MOV_B32 [[SAVEEXEC]]
 
-# ADDR64: [[VRSRC:%[0-9]+]]:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-# ADDR64: [[RSRCPTR:%[0-9]+]]:vreg_64 = COPY [[VRSRC]].sub0_sub1
-# ADDR64: [[ZERO64:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-# ADDR64: [[RSRCFMTLO:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-# ADDR64: [[RSRCFMTHI:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440
-# ADDR64: [[ZERORSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[ZERO64]], %subreg.sub0_sub1, [[RSRCFMTLO]], %subreg.sub2, [[RSRCFMTHI]], %subreg.sub3
-# ADDR64: [[VADDR64:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[RSRCPTR]].sub0, %subreg.sub0, [[RSRCPTR]].sub1, %subreg.sub1
-# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, implicit $exec
 
 ---
 name:            offset
@@ -352,6 +471,127 @@ liveins:
 body:             |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; ADDR64-LABEL: name: offset
+    ; ADDR64: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; ADDR64-NEXT: {{  $}}
+    ; ADDR64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; ADDR64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+    ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+    ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+    ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
+    ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+    ; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440
+    ; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3
+    ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]].sub0, %subreg.sub0, [[COPY9]].sub1, %subreg.sub1
+    ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec
+    ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]]
+    ; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; W64-NO-ADDR64-LABEL: name: offset
+    ; W64-NO-ADDR64: successors: %bb.1(0x80000000)
+    ; W64-NO-ADDR64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W64-NO-ADDR64-NEXT: {{  $}}
+    ; W64-NO-ADDR64-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; W64-NO-ADDR64-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; W64-NO-ADDR64-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; W64-NO-ADDR64-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; W64-NO-ADDR64-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; W64-NO-ADDR64-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; W64-NO-ADDR64-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NO-ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W64-NO-ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+    ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; W64-NO-ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+    ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+    ; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
+    ; W64-NO-ADDR64-NEXT: {{  $}}
+    ; W64-NO-ADDR64-NEXT: .1:
+    ; W64-NO-ADDR64-NEXT: successors: %bb.2(0x80000000)
+    ; W64-NO-ADDR64-NEXT: {{  $}}
+    ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+    ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+    ; W64-NO-ADDR64-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
+    ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec
+    ; W64-NO-ADDR64-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec
+    ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+    ; W64-NO-ADDR64-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
+    ; W64-NO-ADDR64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
+    ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+    ; W64-NO-ADDR64-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; W64-NO-ADDR64-NEXT: {{  $}}
+    ; W64-NO-ADDR64-NEXT: .2:
+    ; W64-NO-ADDR64-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    ; W64-NO-ADDR64-NEXT: {{  $}}
+    ; W64-NO-ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec
+    ; W64-NO-ADDR64-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
+    ; W64-NO-ADDR64-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; W64-NO-ADDR64-NEXT: {{  $}}
+    ; W64-NO-ADDR64-NEXT: .3:
+    ; W64-NO-ADDR64-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]]
+    ; W64-NO-ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; W64-NO-ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]]
+    ; W64-NO-ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ; W32-LABEL: name: offset
+    ; W32: successors: %bb.1(0x80000000)
+    ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+    ; W32-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5
+    ; W32-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+    ; W32-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; W32-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; W32-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; W32-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+    ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+    ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+    ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+    ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .1:
+    ; W32-NEXT: successors: %bb.2(0x80000000)
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub0, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub1, implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
+    ; W32-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[REG_SEQUENCE]].sub0_sub1, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub2, implicit $exec
+    ; W32-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE]].sub3, implicit $exec
+    ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
+    ; W32-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[REG_SEQUENCE]].sub2_sub3, implicit $exec
+    ; W32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
+    ; W32-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+    ; W32-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .2:
+    ; W32-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec
+    ; W32-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
+    ; W32-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
+    ; W32-NEXT: {{  $}}
+    ; W32-NEXT: .3:
+    ; W32-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]]
+    ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
+    ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]]
+    ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
     %5:sreg_64 = COPY $sgpr30_sgpr31
     %4:vreg_64 = COPY $vgpr4_vgpr5
     %3:vgpr_32 = COPY $vgpr3

diff  --git a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
index 611c5bb3271b..0045fa68c4f0 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -1,25 +1,8 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies,si-fold-operands,dead-mi-elimination -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
 # Check that constant is in SGPR registers
 
-# GCN-LABEL: {{^}}name: const_to_sgpr{{$}}
-# GCN:        %[[HI:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
-# GCN-NEXT:   %[[LO:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576
-# GCN-NEXT:   %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], %subreg.sub0, killed %[[HI]], %subreg.sub1
-# GCN-NEXT:   V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit $exec
-
-
-# GCN-LABEL: {{^}}name: const_to_sgpr_multiple_use{{$}}
-# GCN:        %[[HI:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
-# GCN-NEXT:   %[[LO:[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576
-# GCN-NEXT:   %[[SGPR_PAIR:[0-9]+]]:sreg_64 = REG_SEQUENCE killed %[[LO]], %subreg.sub0, killed %[[HI]], %subreg.sub1
-# GCN-NEXT:   V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit $exec
-# GCN-NEXT:   V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit $exec
-
-# GCN-LABEL: {{^}}name: const_to_sgpr_subreg{{$}}
-# GCN:       %[[OP0:[0-9]+]]:vreg_64 = REG_SEQUENCE killed %{{[0-9]+}}, %subreg.sub0, killed %{{[0-9]+}}, %subreg.sub1
-# GCN-NEXT:  V_CMP_LT_U32_e64 killed %[[OP0]].sub0, 12, implicit $exec
-
 --- |
   define amdgpu_kernel void @const_to_sgpr(i32 addrspace(1)* nocapture %arg, i64 %id) {
   bb:
@@ -99,6 +82,44 @@ liveins:
   - { reg: '$vgpr0', virtual-reg: '%2' }
   - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' }
 body:             |
+  ; GCN-LABEL: name: const_to_sgpr
+  ; GCN: bb.0.bb:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 11, 0
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+  ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+  ; GCN-NEXT:   [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_LOAD_DWORDX2_IMM1]].sub0, [[REG_SEQUENCE]].sub0, implicit-def $vcc, implicit $exec
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+  ; GCN-NEXT:   [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_CO_U32_e32_]], implicit $exec
+  ; GCN-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1
+  ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
+  ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576
+  ; GCN-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+  ; GCN-NEXT:   [[V_CMP_LT_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U64_e64 killed [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], implicit $exec
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U64_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1.bb1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[V_LSHL_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHL_B64_e64 [[REG_SEQUENCE]], 2, implicit $exec
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 61440
+  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
+  ; GCN-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_3]], %subreg.sub0, killed [[S_MOV_B32_2]], %subreg.sub1
+  ; GCN-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub1_sub2_sub3_sub4_sub5, killed [[REG_SEQUENCE3]], %subreg.sub1_sub2_sub3_sub4_sub5_sub6
+  ; GCN-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_1]], [[V_LSHL_B64_e64_]], killed [[REG_SEQUENCE4]], 0, 0, 0, 0, 0, implicit $exec
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2.bb2:
+  ; GCN-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   S_ENDPGM 0
   bb.0.bb:
     successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000)
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -197,6 +218,50 @@ liveins:
   - { reg: '$vgpr0', virtual-reg: '%2' }
   - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' }
 body:             |
+  ; GCN-LABEL: name: const_to_sgpr_multiple_use
+  ; GCN: bb.0.bb:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 11, 0
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 13, 0
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+  ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GCN-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY1]], implicit $exec
+  ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+  ; GCN-NEXT:   [[S_ADD_U32_:%[0-9]+]]:sreg_32_xm0 = S_ADD_U32 [[REG_SEQUENCE]].sub0, [[S_LOAD_DWORDX2_IMM1]].sub0, implicit-def $scc
+  ; GCN-NEXT:   [[S_ADDC_U32_:%[0-9]+]]:sreg_32_xm0 = S_ADDC_U32 0, [[S_LOAD_DWORDX2_IMM1]].sub1, implicit-def dead $scc, implicit $scc
+  ; GCN-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_]], %subreg.sub0, killed [[S_ADDC_U32_]], %subreg.sub1
+  ; GCN-NEXT:   [[S_ADD_U32_1:%[0-9]+]]:sreg_32_xm0 = S_ADD_U32 [[REG_SEQUENCE]].sub0, [[S_LOAD_DWORDX2_IMM2]].sub0, implicit-def $scc
+  ; GCN-NEXT:   [[S_ADDC_U32_1:%[0-9]+]]:sreg_32_xm0 = S_ADDC_U32 0, [[S_LOAD_DWORDX2_IMM2]].sub1, implicit-def dead $scc, implicit $scc
+  ; GCN-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_ADD_U32_1]], %subreg.sub0, killed [[S_ADDC_U32_1]], %subreg.sub1
+  ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1048576, implicit $exec
+  ; GCN-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_MOV_B32_e32_]], %subreg.sub0, killed [[V_MOV_B32_e32_1]], %subreg.sub1
+  ; GCN-NEXT:   [[V_CMP_LT_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U64_e64 killed [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], implicit $exec
+  ; GCN-NEXT:   [[V_CMP_LT_U64_e64_1:%[0-9]+]]:sreg_64 = V_CMP_LT_U64_e64 killed [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], implicit $exec
+  ; GCN-NEXT:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 killed [[V_CMP_LT_U64_e64_]], killed [[V_CMP_LT_U64_e64_1]], implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[S_AND_B64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1.bb1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[S_LSHL_B64_:%[0-9]+]]:sreg_64 = S_LSHL_B64 [[REG_SEQUENCE]], 2, implicit-def dead $scc
+  ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 61440
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
+  ; GCN-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_2]], %subreg.sub0, killed [[S_MOV_B32_1]], %subreg.sub1
+  ; GCN-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub1_sub2_sub3_sub4_sub5, killed [[REG_SEQUENCE4]], %subreg.sub1_sub2_sub3_sub4_sub5_sub6
+  ; GCN-NEXT:   [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_LSHL_B64_]]
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_2]], killed [[COPY3]], killed [[REG_SEQUENCE5]], 0, 0, 0, 0, 0, implicit $exec
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2.bb2:
+  ; GCN-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   S_ENDPGM 0
   bb.0.bb:
     successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000)
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -294,6 +359,41 @@ liveins:
   - { reg: '$vgpr0', virtual-reg: '%2' }
   - { reg: '$sgpr0_sgpr1', virtual-reg: '%3' }
 body:             |
+  ; GCN-LABEL: name: const_to_sgpr_subreg
+  ; GCN: bb.0.bb:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $vgpr0, $sgpr0_sgpr1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 11, 0
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
+  ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[V_MOV_B32_e32_]], %subreg.sub1
+  ; GCN-NEXT:   [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_LOAD_DWORDX2_IMM1]].sub0, [[REG_SEQUENCE]].sub0, implicit-def $vcc, implicit $exec
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
+  ; GCN-NEXT:   [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_CO_U32_e32_]], implicit $exec
+  ; GCN-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1
+  ; GCN-NEXT:   [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 killed [[REG_SEQUENCE1]].sub0, 12, implicit $exec
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1.bb1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[V_LSHL_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHL_B64_e64 [[REG_SEQUENCE]], 2, implicit $exec
+  ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 61440
+  ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
+  ; GCN-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
+  ; GCN-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub1_sub2_sub3_sub4_sub5, killed [[REG_SEQUENCE2]], %subreg.sub1_sub2_sub3_sub4_sub5_sub6
+  ; GCN-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_ADDR64 killed [[V_MOV_B32_e32_1]], [[V_LSHL_B64_e64_]], killed [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, implicit $exec
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2.bb2:
+  ; GCN-NEXT:   SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   S_ENDPGM 0
   bb.0.bb:
     successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000)
     liveins: $vgpr0, $sgpr0_sgpr1

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 4ff74b63ebce..afd429dd079e 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -97,9 +97,9 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, s10, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -494,7 +494,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -524,7 +524,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -628,7 +628,7 @@ define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
@@ -664,7 +664,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -694,7 +694,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -731,7 +731,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -761,7 +761,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -828,7 +828,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -864,7 +864,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s10
 ; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s6
@@ -926,7 +926,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-IR-NEXT:    v_bfe_i32 v2, v2, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
@@ -1161,7 +1161,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 24, v2
@@ -1854,7 +1854,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s8
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -1880,7 +1880,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s8
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index d57bd390ab99..62754db64e0e 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -3,11 +3,11 @@
 
 define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
   ; GCN-LABEL: name: test_spill_av_class
-  ; GCN:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %21.sub0
+  ; GCN:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1835018 /* regdef:VGPR_32 */, def undef %22.sub0
   ; GCN-NEXT:   undef [[AV_REG:%[0-9]+]].sub0:av_64 = COPY %{{[0-9]+}}.sub0
   ; GCN-NEXT:   SI_SPILL_AV64_SAVE [[AV_REG]], %stack.0, $sgpr32, 0, implicit $exec
   ; GCN:   [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec
-  ; GCN-NEXT:   undef %22.sub0:vreg_64 = COPY [[SI_SPILL_AV64_RESTORE]].sub0
+  ; GCN-NEXT:   undef %23.sub0:vreg_64 = COPY [[SI_SPILL_AV64_RESTORE]].sub0
   %v0 = call i32 asm sideeffect "; def $0", "=v"()
   %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0
   %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 8398cf0fe898..e9472a865fd9 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -375,8 +375,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
   ; CHECK-NEXT:   [[V_OR_B32_e32_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_ADD_I32_24]], [[V_OR_B32_e32_66]], implicit $exec
   ; CHECK-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e32_67]], implicit $exec
-  ; CHECK-NEXT:   undef %692.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
-  ; CHECK-NEXT:   IMAGE_STORE_V4_V2_gfx10 %692, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into custom "ImageResource")
+  ; CHECK-NEXT:   undef %693.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
+  ; CHECK-NEXT:   IMAGE_STORE_V4_V2_gfx10 %693, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into custom "ImageResource")
   ; CHECK-NEXT:   S_ENDPGM 0
 .expVert:
   %0 = extractelement <31 x i32> %userData, i64 2

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 1cfd336196c4..1c623549ffee 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -49,7 +49,7 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -86,8 +86,8 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
@@ -202,8 +202,8 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    s_mov_b32 s10, -1
@@ -473,7 +473,7 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -505,7 +505,7 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -544,7 +544,7 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -669,7 +669,7 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -701,7 +701,7 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -740,7 +740,7 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -772,7 +772,7 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -810,7 +810,7 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s3, v0
@@ -910,7 +910,7 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -950,8 +950,8 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s15, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s14, v0
@@ -1078,15 +1078,13 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
 ; GCN-IR-NEXT:  .LBB8_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s8, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s8, v1
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    v_readfirstlane_b32 s10, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, s8, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, s9, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s10, v1
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -1094,6 +1092,7 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
@@ -1237,8 +1236,8 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s7, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -1337,7 +1336,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v1, s9, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, s8, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
@@ -1444,8 +1443,8 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
@@ -1994,7 +1993,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 6161fed68598..0c9bf2ac2f76 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -671,7 +671,7 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
 ; SI-NEXT:    v_subrev_i32_e32 v13, vcc, v1, v5
 ; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
-; SI-NEXT:    v_sub_i32_e32 v14, vcc, v6, v2
+; SI-NEXT:    v_subrev_i32_e32 v14, vcc, v2, v6
 ; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
 ; SI-NEXT:    v_sub_i32_e32 v15, vcc, v7, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
@@ -1851,14 +1851,14 @@ define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
 ; SI-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; SI-NEXT:    v_mul_lo_u32 v4, v4, v1
 ; SI-NEXT:    v_mul_hi_u32 v4, v1, v4
-; SI-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; SI-NEXT:    v_mul_hi_u32 v1, v2, v1
 ; SI-NEXT:    v_mul_lo_u32 v3, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; SI-NEXT:    v_subrev_i32_e32 v2, vcc, v3, v2
 ; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; SI-NEXT:    v_sub_i32_e32 v3, vcc, v2, v0
+; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v2
 ; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
 ; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
@@ -2352,7 +2352,7 @@ define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readon
 ; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -2500,8 +2500,8 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_mul_hi_u32 v4, v2, s4
 ; SI-NEXT:    v_mul_lo_u32 v6, v3, s4
 ; SI-NEXT:    v_mul_lo_u32 v5, v2, s4
-; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
+; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; SI-NEXT:    v_mul_hi_u32 v7, v2, v5
 ; SI-NEXT:    v_mul_lo_u32 v6, v2, v4
 ; SI-NEXT:    v_mul_hi_u32 v8, v2, v4
@@ -2598,8 +2598,8 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; VI-NEXT:    v_cvt_u32_f32_e32 v7, v3
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
-; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
-; VI-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
+; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
+; VI-NEXT:    v_add_u32_e32 v8, vcc, v3, v4
 ; VI-NEXT:    v_mul_hi_u32 v5, v6, v2
 ; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, v5, v3
@@ -2685,8 +2685,8 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v3
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
+; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_add_u32_e32 v8, vcc, v3, v4
 ; GCN-NEXT:    v_mul_hi_u32 v5, v6, v2
 ; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
 ; GCN-NEXT:    v_add_u32_e32 v9, vcc, v5, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index a59b5d011acf..861843867735 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -86,9 +86,9 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s9, v0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s9
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, s8, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -723,9 +723,9 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_mul_lo_u32 v3, s2, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
 ; GCN-NEXT:    v_mul_lo_u32 v5, s3, v1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, s2, v1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GCN-NEXT:    v_mul_lo_u32 v7, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v8, v1, v4
 ; GCN-NEXT:    v_mul_hi_u32 v9, v1, v3
@@ -953,7 +953,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 24, v2
@@ -1374,7 +1374,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -1547,7 +1547,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v4, v2, s4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v3, s4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v2, s4
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, v2, v6

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 3d21601f8d20..e0a6fd977c1a 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -49,7 +49,7 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -86,8 +86,8 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
@@ -202,8 +202,8 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    s_mov_b32 s10, -1
@@ -941,7 +941,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index eac4fd0563ca..39c717ae9a19 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -234,10 +234,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
   ; SI-NEXT: bb.1.Flow:
   ; SI-NEXT:   successors: %bb.2(0x40000000), %bb.10(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %43:vgpr_32, %bb.0, %4, %bb.9
-  ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %45:vgpr_32, %bb.9
-  ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %47:vgpr_32, %bb.9
-  ; SI-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %49:vgpr_32, %bb.9
+  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9
+  ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9
+  ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9
+  ; SI-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9
   ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.2
   ; SI-NEXT: {{  $}}
@@ -250,8 +250,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
   ; SI-NEXT: bb.3:
   ; SI-NEXT:   successors: %bb.4(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %51:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
-  ; SI-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %53:vgpr_32, %bb.4, [[PHI1]], %bb.2
+  ; SI-NEXT:   [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
+  ; SI-NEXT:   [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2
   ; SI-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec
   ; SI-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1
@@ -287,8 +287,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %
   ; SI-NEXT: bb.7:
   ; SI-NEXT:   successors: %bb.8(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
-  ; SI-NEXT:   [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.8, [[COPY4]], %bb.6
+  ; SI-NEXT:   [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
+  ; SI-NEXT:   [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6
   ; SI-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec
   ; SI-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1
@@ -357,9 +357,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(
   ; SI-NEXT: bb.1.Flow:
   ; SI-NEXT:   successors: %bb.2(0x40000000), %bb.10(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %44:vgpr_32, %bb.0, %4, %bb.9
-  ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %46:vgpr_32, %bb.9
-  ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %48:vgpr_32, %bb.9
+  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9
+  ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9
+  ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9
   ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.2
   ; SI-NEXT: {{  $}}
@@ -372,7 +372,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(
   ; SI-NEXT: bb.3:
   ; SI-NEXT:   successors: %bb.4(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %50:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
+  ; SI-NEXT:   [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
   ; SI-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec
   ; SI-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1
@@ -408,7 +408,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(
   ; SI-NEXT: bb.7:
   ; SI-NEXT:   successors: %bb.8(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %52:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
+  ; SI-NEXT:   [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
   ; SI-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec
   ; SI-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1
@@ -475,9 +475,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %s
   ; SI-NEXT:   successors: %bb.7(0x80000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset.cast, align 4, addrspace 4)
-  ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %50, 0, implicit $exec
-  ; SI-NEXT:   %43:vgpr_32, dead %45:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-  ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %43, %subreg.sub1
+  ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %51, 0, implicit $exec
+  ; SI-NEXT:   %44:vgpr_32, dead %46:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %44, %subreg.sub1
   ; SI-NEXT:   [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
   ; SI-NEXT:   [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
   ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
@@ -503,14 +503,14 @@ define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %s
   ; SI-NEXT: bb.5.Flow:
   ; SI-NEXT:   successors: %bb.1(0x40000000), %bb.7(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %51:vgpr_32, %bb.6
+  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %52:vgpr_32, %bb.6
   ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.1
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.6.sw.bb18:
   ; SI-NEXT:   successors: %bb.5(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %37:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
+  ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
   ; SI-NEXT:   [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
   ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
   ; SI-NEXT:   S_BRANCH %bb.5
@@ -576,8 +576,8 @@ define protected amdgpu_kernel void @nested_waterfalls(%tex* addrspace(1)* %tex.
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 3, killed [[REG_SEQUENCE]], implicit $exec
   ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[V_LSHLREV_B64_e64_]].sub0, 0, implicit $exec
-  ; SI-NEXT:   %69:vgpr_32, dead %71:sreg_32_xm0_xexec = V_ADDC_U32_e64 killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_LSHLREV_B64_e64_]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
-  ; SI-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %69, %subreg.sub1
+  ; SI-NEXT:   %85:vgpr_32, dead %87:sreg_32_xm0_xexec = V_ADDC_U32_e64 killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_LSHLREV_B64_e64_]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %85, %subreg.sub1
   ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 killed [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
   ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_]], 16, 0, implicit $exec :: (dereferenceable invariant load (s128) from %ir.6 + 16, addrspace 4)
   ; SI-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3
@@ -596,17 +596,27 @@ define protected amdgpu_kernel void @nested_waterfalls(%tex* addrspace(1)* %tex.
   ; SI-NEXT: bb.2:
   ; SI-NEXT:   successors: %bb.3(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE3]], [[GLOBAL_LOAD_DWORDX4_2]].sub0_sub1, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE4]], [[GLOBAL_LOAD_DWORDX4_2]].sub2_sub3, implicit $exec
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec
   ; SI-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_]], killed [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
-  ; SI-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_2]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; SI-NEXT:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub4, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub5, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub4_sub5, implicit $exec
+  ; SI-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_]], killed [[V_CMP_EQ_U64_e64_2]], implicit-def dead $scc
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub6, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub7, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE6:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE6]], [[REG_SEQUENCE2]].sub6_sub7, implicit $exec
+  ; SI-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_1]], killed [[V_CMP_EQ_U64_e64_3]], implicit-def dead $scc
+  ; SI-NEXT:   [[REG_SEQUENCE7:%[0-9]+]]:sgpr_256 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_2]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub3, killed [[V_READFIRSTLANE_B32_4]], %subreg.sub4, killed [[V_READFIRSTLANE_B32_5]], %subreg.sub5, killed [[V_READFIRSTLANE_B32_6]], %subreg.sub6, killed [[V_READFIRSTLANE_B32_7]], %subreg.sub7
+  ; SI-NEXT:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_2]], implicit-def $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.3:
   ; SI-NEXT:   successors: %bb.4(0x80000000)
@@ -616,32 +626,22 @@ define protected amdgpu_kernel void @nested_waterfalls(%tex* addrspace(1)* %tex.
   ; SI-NEXT: bb.4:
   ; SI-NEXT:   successors: %bb.5(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec
-  ; SI-NEXT:   [[REG_SEQUENCE6:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE6]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec
-  ; SI-NEXT:   [[REG_SEQUENCE7:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE7]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec
-  ; SI-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_2]], killed [[V_CMP_EQ_U64_e64_3]], implicit-def dead $scc
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub4, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub5, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE8:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_8]], %subreg.sub0, [[V_READFIRSTLANE_B32_9]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE8]], [[REG_SEQUENCE2]].sub4_sub5, implicit $exec
-  ; SI-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_1]], killed [[V_CMP_EQ_U64_e64_4]], implicit-def dead $scc
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub6, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub7, implicit $exec
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE8]], [[GLOBAL_LOAD_DWORDX4_2]].sub0_sub1, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE9:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_10]], %subreg.sub0, [[V_READFIRSTLANE_B32_11]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE9]], [[REG_SEQUENCE2]].sub6_sub7, implicit $exec
-  ; SI-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_2]], killed [[V_CMP_EQ_U64_e64_5]], implicit-def dead $scc
-  ; SI-NEXT:   [[REG_SEQUENCE10:%[0-9]+]]:sgpr_256 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_4]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_5]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_6]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_7]], %subreg.sub3, killed [[V_READFIRSTLANE_B32_8]], %subreg.sub4, killed [[V_READFIRSTLANE_B32_9]], %subreg.sub5, killed [[V_READFIRSTLANE_B32_10]], %subreg.sub6, killed [[V_READFIRSTLANE_B32_11]], %subreg.sub7
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE9]], [[GLOBAL_LOAD_DWORDX4_2]].sub2_sub3, implicit $exec
+  ; SI-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_4]], killed [[V_CMP_EQ_U64_e64_5]], implicit-def dead $scc
+  ; SI-NEXT:   [[REG_SEQUENCE10:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_8]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_9]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_10]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_11]], %subreg.sub3
   ; SI-NEXT:   [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_3]], implicit-def $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.5:
   ; SI-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %27:vreg_64, killed [[REG_SEQUENCE10]], [[REG_SEQUENCE5]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "ImageResource")
+  ; SI-NEXT:   [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %27:vreg_64, [[REG_SEQUENCE7]], killed [[REG_SEQUENCE10]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "ImageResource")
   ; SI-NEXT:   $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc
   ; SI-NEXT:   SI_WATERFALL_LOOP %bb.4, implicit $exec
   ; SI-NEXT: {{  $}}

diff  --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 16de2cbe761f..29f8c60ad281 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -431,9 +431,12 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr2_vgpr3 killed $exec
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
@@ -458,6 +461,7 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0-NEXT:    v_add3_u32 v0, v0, v1, v2
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr36
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s35
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
@@ -465,6 +469,7 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    s_mov_b32 s35, 0
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v0
@@ -578,6 +583,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
 ; GFX9-O0-NEXT:    v_readlane_b32 s39, v10, 5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
+; GFX9-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
 ; GFX9-O0-NEXT:    v_add_co_u32_e64 v2, s[40:41], v2, v4
@@ -712,6 +720,9 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
+; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s35, 0x7fffffff
@@ -728,6 +739,9 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v13
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
+; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
@@ -751,6 +765,13 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v6
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
+; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $exec
+; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 killed $exec
+; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v10
